diff --git a/.gitignore b/.gitignore index 3e6b5557..a6d9c1d3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,7 @@ test/temp_fullydistvec *.o SpGEMM3D mcl3d +_build +_install +.clangd +.cache diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..fff80899 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,76 @@ +{ + "files.associations": { + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "__nullptr": "cpp", + "filesystem": "cpp", + "__locale": "cpp", + "locale": "cpp", + "__config": "cpp" + } +} \ No newline at end of file diff --git a/3DSpGEMM/matlab/mmwrite.m b/3DSpGEMM/matlab/mmwrite.m index babeb5c9..53a71b7f 100644 --- a/3DSpGEMM/matlab/mmwrite.m +++ b/3DSpGEMM/matlab/mmwrite.m @@ -1,274 +1,274 @@ -function [ err ] = mmwrite(filename,A,comment,field,precision) -% -% Function: mmwrite(filename,A,comment,field,precision) -% -% Writes the sparse or dense matrix A to a Matrix Market (MM) -% formatted file. -% -% Required arguments: -% -% filename - destination file -% -% A - sparse or full matrix -% -% Optional arguments: -% -% comment - matrix of comments to prepend to -% the MM file. To build a comment matrix, -% use str2mat. For example: -% -% comment = str2mat(' Comment 1' ,... -% ' Comment 2',... -% ' and so on.',... -% ' to attach a date:',... -% [' ',date]); -% If ommitted, a single line date stamp comment -% will be included. -% -% field - 'real' -% 'complex' -% 'integer' -% 'pattern' -% If ommitted, data will determine type. -% -% precision - number of digits to display for real -% or complex values -% If ommitted, full working precision is used. 
-% - -if ( nargin == 5) - precision = 16; -elseif ( nargin == 4) - precision = 16; -elseif ( nargin == 3) - mattype = 'real'; % placeholder, will check after FIND-ing A - precision = 16; -elseif ( nargin == 2) - comment = ''; - % Check whether there is an imaginary part: - mattype = 'real'; % placeholder, will check after FIND-ing A - precision = 16; -end - -mmfile = fopen([filename],'w'); -if ( mmfile == -1 ) - error('Cannot open file for output'); -end; - - -[M,N] = size(A); - -%%%%%%%%%%%%% This part for sparse matrices %%%%%%%%%%%%%%%% -if ( issparse(A) ) - - [I,J,V] = find(A); - if ( sum(abs(imag(nonzeros(V)))) > 0 ) - Vreal = 0; - else - Vreal = 1; - end - - if ( ~ strcmp(mattype,'pattern') & Vreal ) - mattype = 'real'; - elseif ( ~ strcmp(mattype,'pattern') ) - mattype = 'complex'; - end -% -% Determine symmetry: -% - if ( M ~= N ) - symm = 'general'; - issymm = 0; - NZ = length(V); - else - issymm = 1; - NZ = length(V); - for i=1:NZ - if ( A(J(i),I(i)) ~= V(i) ) - issymm = 0; - break; - end - end - if ( issymm ) - symm = 'symmetric'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - else - isskew = 1; - for i=1:NZ - if ( A(J(i),I(i)) ~= - V(i) ) - isskew = 0; - break; - end - end - if ( isskew ) - symm = 'skew-symmetric'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - elseif ( strcmp(mattype,'complex') ) - isherm = 1; - for i=1:NZ - if ( A(J(i),I(i)) ~= conj(V(i)) ) - isherm = 0; - break; - end - end - if ( isherm ) - symm = 'hermitian'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - else - symm = 'general'; - NZ = nnz(A); - end - else - symm = 'general'; - NZ = nnz(A); - end - end - end - -% Sparse coordinate format: - - rep = 'coordinate'; - - - fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); - [MC,NC] = size(comment); - if ( MC == 0 ) - fprintf(mmfile,'%% Generated %s\n',[date]); - else - for i=1:MC, - fprintf(mmfile,'%%%s\n',comment(i,:)); - end - end - fprintf(mmfile,'%d %d %d\n',M,N,NZ); - cplxformat = sprintf('%%d %%d %% .%dg %% .%dg\n',precision,precision); - realformat = sprintf('%%d %%d %% .%dg\n',precision); - if ( strcmp(mattype,'real') ) - for i=1:NZ - fprintf(mmfile,realformat,I(i),J(i),V(i)); - end; - elseif ( strcmp(mattype,'complex') ) - for i=1:NZ - fprintf(mmfile,cplxformat,I(i),J(i),real(V(i)),imag(V(i))); - end; - elseif ( strcmp(mattype,'pattern') ) - for i=1:NZ - fprintf(mmfile,'%d %d\n',I(i),J(i)); - end; - else - err = -1; - disp('Unsupported mattype:') - mattype - end; - -%%%%%%%%%%%%% This part for dense matrices %%%%%%%%%%%%%%%% -else - if ( sum(abs(imag(nonzeros(A)))) > 0 ) - Areal = 0; - else - Areal = 1; - end - if ( ~strcmp(mattype,'pattern') & Areal ) - mattype = 'real'; - elseif ( ~strcmp(mattype,'pattern') ) - mattype = 'complex'; - end -% -% Determine symmetry: -% - if ( M ~= N ) - issymm = 0; - symm = 'general'; - else - issymm = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= A(j,i) ) - issymm = 0; - break; - end - end - if ( ~ issymm ) break; end - - end - if ( issymm ) - symm = 'symmetric'; - else - isskew = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= - A(j,i) ) - isskew = 0; - break; - end - end - if ( ~ isskew ) break; end - end - if ( isskew ) - symm = 'skew-symmetric'; - elseif ( strcmp(mattype,'complex') ) - isherm = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= conj(A(j,i)) ) - isherm = 0; - break; - end - end - if ( ~ isherm ) break; end - end - if ( isherm ) - symm = 'hermitian'; - else - symm = 'general'; - end - else - symm = 'general'; - end - end - end - -% 
Dense array format: - - rep = 'array'; - [MC,NC] = size(comment); - fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); - for i=1:MC, - fprintf(mmfile,'%%%s\n',comment(i,:)); - end; - fprintf(mmfile,'%d %d\n',M,N); - cplxformat = sprintf('%% .%dg %% .%dg\n', precision,precision); - realformat = sprintf('%% .%dg\n', precision); - if ( ~ strcmp(symm,'general') ) - rowloop = 'j'; - else - rowloop = '1'; - end - if ( strcmp(mattype,'real') ) - for j=1:N - for i=eval(rowloop):M - fprintf(mmfile,realformat,A(i,j)); - end - end - elseif ( strcmp(mattype,'complex') ) - for j=1:N - for i=eval(rowloop):M - fprintf(mmfile,cplxformat,real(A(i,j)),imag(A(i,j))); - end - end - elseif ( strcmp(mattype,'pattern') ) - err = -2 - disp('Pattern type inconsistant with dense matrix') - else - err = -2 - disp('Unknown matrix type:') - mattype - end -end - -fclose(mmfile); +function [ err ] = mmwrite(filename,A,comment,field,precision) +% +% Function: mmwrite(filename,A,comment,field,precision) +% +% Writes the sparse or dense matrix A to a Matrix Market (MM) +% formatted file. +% +% Required arguments: +% +% filename - destination file +% +% A - sparse or full matrix +% +% Optional arguments: +% +% comment - matrix of comments to prepend to +% the MM file. To build a comment matrix, +% use str2mat. For example: +% +% comment = str2mat(' Comment 1' ,... +% ' Comment 2',... +% ' and so on.',... +% ' to attach a date:',... +% [' ',date]); +% If ommitted, a single line date stamp comment +% will be included. +% +% field - 'real' +% 'complex' +% 'integer' +% 'pattern' +% If ommitted, data will determine type. +% +% precision - number of digits to display for real +% or complex values +% If ommitted, full working precision is used. +% + +if ( nargin == 5) + precision = 16; +elseif ( nargin == 4) + precision = 16; +elseif ( nargin == 3) + mattype = 'real'; % placeholder, will check after FIND-ing A + precision = 16; +elseif ( nargin == 2) + comment = ''; + % Check whether there is an imaginary part: + mattype = 'real'; % placeholder, will check after FIND-ing A + precision = 16; +end + +mmfile = fopen([filename],'w'); +if ( mmfile == -1 ) + error('Cannot open file for output'); +end; + + +[M,N] = size(A); + +%%%%%%%%%%%%% This part for sparse matrices %%%%%%%%%%%%%%%% +if ( issparse(A) ) + + [I,J,V] = find(A); + if ( sum(abs(imag(nonzeros(V)))) > 0 ) + Vreal = 0; + else + Vreal = 1; + end + + if ( ~ strcmp(mattype,'pattern') & Vreal ) + mattype = 'real'; + elseif ( ~ strcmp(mattype,'pattern') ) + mattype = 'complex'; + end +% +% Determine symmetry: +% + if ( M ~= N ) + symm = 'general'; + issymm = 0; + NZ = length(V); + else + issymm = 1; + NZ = length(V); + for i=1:NZ + if ( A(J(i),I(i)) ~= V(i) ) + issymm = 0; + break; + end + end + if ( issymm ) + symm = 'symmetric'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + else + isskew = 1; + for i=1:NZ + if ( A(J(i),I(i)) ~= - V(i) ) + isskew = 0; + break; + end + end + if ( isskew ) + symm = 'skew-symmetric'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + elseif ( strcmp(mattype,'complex') ) + isherm = 1; + for i=1:NZ + if ( A(J(i),I(i)) ~= conj(V(i)) ) + isherm = 0; + break; + end + end + if ( isherm ) + symm = 'hermitian'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + else + symm = 'general'; + NZ = nnz(A); + end + else + symm = 'general'; + NZ = nnz(A); + end + end + end + +% Sparse coordinate format: + + rep = 'coordinate'; + + + fprintf(mmfile,'%%%%MatrixMarket matrix %s %s 
%s\n',rep,mattype,symm); + [MC,NC] = size(comment); + if ( MC == 0 ) + fprintf(mmfile,'%% Generated %s\n',[date]); + else + for i=1:MC, + fprintf(mmfile,'%%%s\n',comment(i,:)); + end + end + fprintf(mmfile,'%d %d %d\n',M,N,NZ); + cplxformat = sprintf('%%d %%d %% .%dg %% .%dg\n',precision,precision); + realformat = sprintf('%%d %%d %% .%dg\n',precision); + if ( strcmp(mattype,'real') ) + for i=1:NZ + fprintf(mmfile,realformat,I(i),J(i),V(i)); + end; + elseif ( strcmp(mattype,'complex') ) + for i=1:NZ + fprintf(mmfile,cplxformat,I(i),J(i),real(V(i)),imag(V(i))); + end; + elseif ( strcmp(mattype,'pattern') ) + for i=1:NZ + fprintf(mmfile,'%d %d\n',I(i),J(i)); + end; + else + err = -1; + disp('Unsupported mattype:') + mattype + end; + +%%%%%%%%%%%%% This part for dense matrices %%%%%%%%%%%%%%%% +else + if ( sum(abs(imag(nonzeros(A)))) > 0 ) + Areal = 0; + else + Areal = 1; + end + if ( ~strcmp(mattype,'pattern') & Areal ) + mattype = 'real'; + elseif ( ~strcmp(mattype,'pattern') ) + mattype = 'complex'; + end +% +% Determine symmetry: +% + if ( M ~= N ) + issymm = 0; + symm = 'general'; + else + issymm = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= A(j,i) ) + issymm = 0; + break; + end + end + if ( ~ issymm ) break; end + + end + if ( issymm ) + symm = 'symmetric'; + else + isskew = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= - A(j,i) ) + isskew = 0; + break; + end + end + if ( ~ isskew ) break; end + end + if ( isskew ) + symm = 'skew-symmetric'; + elseif ( strcmp(mattype,'complex') ) + isherm = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= conj(A(j,i)) ) + isherm = 0; + break; + end + end + if ( ~ isherm ) break; end + end + if ( isherm ) + symm = 'hermitian'; + else + symm = 'general'; + end + else + symm = 'general'; + end + end + end + +% Dense array format: + + rep = 'array'; + [MC,NC] = size(comment); + fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); + for i=1:MC, + fprintf(mmfile,'%%%s\n',comment(i,:)); + end; + fprintf(mmfile,'%d %d\n',M,N); + cplxformat = sprintf('%% .%dg %% .%dg\n', precision,precision); + realformat = sprintf('%% .%dg\n', precision); + if ( ~ strcmp(symm,'general') ) + rowloop = 'j'; + else + rowloop = '1'; + end + if ( strcmp(mattype,'real') ) + for j=1:N + for i=eval(rowloop):M + fprintf(mmfile,realformat,A(i,j)); + end + end + elseif ( strcmp(mattype,'complex') ) + for j=1:N + for i=eval(rowloop):M + fprintf(mmfile,cplxformat,real(A(i,j)),imag(A(i,j))); + end + end + elseif ( strcmp(mattype,'pattern') ) + err = -2 + disp('Pattern type inconsistant with dense matrix') + else + err = -2 + disp('Unknown matrix type:') + mattype + end +end + +fclose(mmfile); diff --git a/Applications/CMakeLists.txt b/Applications/CMakeLists.txt index a5fb38e9..1d978f5f 100644 --- a/Applications/CMakeLists.txt +++ b/Applications/CMakeLists.txt @@ -1,11 +1,10 @@ # Top level directory has the include files - ADD_EXECUTABLE( tdbfs TopDownBFS.cpp ) ADD_EXECUTABLE( dobfs DirOptBFS.cpp ) ADD_EXECUTABLE( fbfs FilteredBFS.cpp ) ADD_EXECUTABLE( fmis FilteredMIS.cpp ) -ADD_EXECUTABLE( mcl MCL.cpp ) +#ADD_EXECUTABLE( mcl MCL.cpp ) ADD_EXECUTABLE( betwcent BetwCent.cpp ) ADD_EXECUTABLE( lacc CC.cpp) @@ -13,7 +12,7 @@ TARGET_LINK_LIBRARIES( tdbfs CombBLAS) TARGET_LINK_LIBRARIES( dobfs CombBLAS) TARGET_LINK_LIBRARIES( fbfs CombBLAS) TARGET_LINK_LIBRARIES( fmis CombBLAS) -TARGET_LINK_LIBRARIES( mcl CombBLAS) +#TARGET_LINK_LIBRARIES( mcl CombBLAS) TARGET_LINK_LIBRARIES( betwcent CombBLAS) TARGET_LINK_LIBRARIES( lacc CombBLAS) diff --git 
a/Applications/Incremental/CMakeLists.txt b/Applications/Incremental/CMakeLists.txt index 1ce6c2b6..6dd61baf 100644 --- a/Applications/Incremental/CMakeLists.txt +++ b/Applications/Incremental/CMakeLists.txt @@ -10,7 +10,7 @@ ADD_EXECUTABLE(full Full.cpp) ADD_EXECUTABLE(testideas Test.cpp) ADD_EXECUTABLE(prep-data Prep-Data.cpp) ADD_EXECUTABLE(prep-data-metaclust Prep-Data-Metaclust.cpp) -ADD_EXECUTABLE(lcc LargestCC.cpp) +#ADD_EXECUTABLE(lcc LargestCC.cpp) ADD_EXECUTABLE(inc-pipeline Incremental-Pipeline.cpp) ADD_EXECUTABLE(inc-baseline-pipeline Incremental-Baseline-Pipeline.cpp) ADD_EXECUTABLE(inc-toy-pipeline Incremental-Toy-Pipeline.cpp) @@ -20,7 +20,7 @@ TARGET_LINK_LIBRARIES( full CombBLAS ) TARGET_LINK_LIBRARIES( testideas CombBLAS ) TARGET_LINK_LIBRARIES( prep-data CombBLAS ) TARGET_LINK_LIBRARIES( prep-data-metaclust CombBLAS ) -TARGET_LINK_LIBRARIES( lcc CombBLAS ) +#TARGET_LINK_LIBRARIES( lcc CombBLAS ) TARGET_LINK_LIBRARIES( inc-pipeline CombBLAS ) TARGET_LINK_LIBRARIES( inc-baseline-pipeline CombBLAS ) TARGET_LINK_LIBRARIES( inc-toy-pipeline CombBLAS ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27cf1521..c03d6043 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.3) project(CombBLAS VERSION 2.0.1 LANGUAGES C CXX) - +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # require c++14 set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED YES) @@ -18,14 +18,40 @@ else() target_compile_features(CombBLAS PUBLIC cxx_return_type_deduction) endif() + # set include directories target_include_directories(CombBLAS PUBLIC $ $) target_include_directories(CombBLAS PUBLIC $ $) target_include_directories(CombBLAS PRIVATE include/CombBLAS) -# MPI and OpenMP dependencies +# MPI and OpenMP and CUDA dependencies find_package(MPI REQUIRED) find_package(OpenMP) +# This needs to be split based on if CMake >= 3.17, as this is deprecated above that +find_package(CUDA) + + +if(CUDA_FOUND) + #target_compile_definitions(CombBLAS PUBLIC GPU_ENABLED) + enable_language(CUDA) + #set(CUDA_HOST_COMPILER "nvc++") # NVHPC requires this...sorry if this causes any issues for anyone + set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" "--ftemplate-backtrace-limit 1 --expt-relaxed-constexpr --disable-warnings") + #FILE(GLOB_RECURSE MyCSources *) + #FILE(GLOB_RECURSE MyHSources *.h*) + #cset_source_files_properties(${MyCSources} PROPERTIES LANGUAGE CUDA) + #set_source_files_properties(src/CommGrid.cpp src/MPIType.cpp src/MPIOp.cpp src/MemoryPool.cpp src/hash.cpp PROPERTIES LANGUAGE CXX) + #set_source_files_properties(src/mmio.c PROPERTIES LANGUAGE C) + # Hack to avoid using #include + #target_link_libraries(CombBLAS PUBLIC ${CUDA_LIBRARIES}) + #cuda_compile(CUDASpGEMM include/CombBLAS/cudaSpGEMM.cu) + #target_sources(CombBLAS PRIVATE ${CUDASpGEMM}) + set(THREADS_PREFER_PTHREAD_FLAG OFF) + set_property(TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS $<$:-Xcompiler -pthread> + "$<$>:-pthread>") +else() + message(STATUS "CUDA features disabled") +endif() if(TARGET MPI::MPI_CXX) # Use target if available (CMake >= 3.9) target_link_libraries(CombBLAS PUBLIC MPI::MPI_CXX) @@ -44,6 +70,9 @@ elseif(OPENMP_FOUND) target_link_libraries(CombBLAS PUBLIC "${OpenMP_CXX_FLAGS}") endif() + + + add_subdirectory(usort) target_link_libraries(CombBLAS PUBLIC Usortlib) @@ -116,6 +145,9 @@ install( enable_testing() include(CTest) +# Warnings cause the compiler to crash, surpress them to prevent that +add_definitions(-w) + add_subdirectory(ReleaseTests) add_subdirectory(Applications) 
add_subdirectory(Applications/Ordering) diff --git a/FAQ-combblas-old.html b/FAQ-combblas-old.html index c52d8b18..74050a3f 100644 --- a/FAQ-combblas-old.html +++ b/FAQ-combblas-old.html @@ -1,1397 +1,1397 @@ - - - - - - - - - - -The Combinatorial BLAS Release Notes - - - - - - - - - - - - -
- -

Frequently Asked Questions about -Combinatorial BLAS

- -

Go back -to the the Combinatorial BLAS home page.

- -

 

- -

 

- -

Q1: I would like to use your Combinatorial BLAS code for -some of my experiments which involve sparse matrix multiplication. However, it -is not clear how to write an output of PSpGEMM(…) -function to a file. I've tried to use "put" function of SpParMat, but -it outputs part of the matrix that corresponds to particular process and not -the whole matrix. Is there a way to do it using your code?

- -

 

- -

A1: Yes, SpParMat::SaveGathered(…) -will create a single file output (albeit slow) sorted with increasing row id's -when called like A.SaveGathered("product.txt"). -The caveat is that  "gathered" I/O in human readable form is -quite slow due to serialization on large processor counts. It should only be -used for debugging, ideally. For vectors, we have a much much -faster version:  -FullyDistVec::ParallelWrite (…), which -should be used instead. SpParMat will also get a ParallelWrite soon.

- -

----

- -

 

- -

Q2: Does Combinatorial BLAS support in-node multithreading? 

- -

 

- -

A2: Almost all expensive primitives -(SpGEMM, SpMV with sparse vectors, SpMV with dense vectors, EWiseMult, Apply, -Set) are hybrid multithreaded within a socket. Read this example.

- -

 

- -

---

- -

 

- -

Q3: Reading/writing text files is really slow for my -purposes, what can I do?

- -

 

- -

A3: Starting from version 1.6, we now have -extremely fast matrix market (text) file reading, check out -SpParMat::ParallelReadMM() and FullyDistVec::ParallelReadMM()

- -

 

- -

---

- -

 

- -

Q4: Is there a preferred way to prune elements from a -SpParMat according to a predicate?

- -

 

- -

A4: Yes, SpParMat::Prune(…) will do it according to a predicate. An -overloaded version of the same function, SpParMat::Prune(ri,ci) -will prune all entries whose row indices are in ri and column indices are in ci

- -

 

- -

---

- -

 

- -

Q5: I am trying to run CombBLAS on Windows but the MS MPI does -not seem to support MPI C++ bindings.

- -

 

- -

A5: Combinatorial BLAS recently (version 1.3.0) switched to -C-API for all its internal MPI calls. After that, we've also compiled CombLAS -on a windows machine without problems. However, we recommend using an open -source MPI for windows too, such as MPICH-2.

- -

 

- -

---

- -

 

- -

Q6: I would like to use Combinatorial BLAS for some parallel -sparse matrix-matrix multiplications. This works quite well, however when I try -to assign a m x 1 sparse matrix (actually a vector) to the first column of an -existing sparse matrix with SpAsgn I get an error saying: "Matrix is too -small to be splitted". Is this because it's not possible to use SpAsgn on -vector-like matrices?

- -

 

- -

A6: SpAsgn internally uses a memory efficient Mult_AnXBn_DoubleBuff as opposed to Mult_AnXBn_Synch). -You might probably go into SpParMat<IT,NT,DER>::SpAsgn(...) -and change occuranges of Mult_AnXBn_DoubleBuff -to Mult_AnXBn_Synch. However, this will likely -only solve your problem for the serial case. because ComBBLAS can not -effectively 2D decompose an m x 1 matrix: each dimension should ideally be at -least sqrt(p). It is much better to represent that vector as a FullyDistSpVec.

- -

 

- -

---

- -

 

- -

Q7: Starting from a general sparse matrix Z, I want to -construct the symmetric matrix M: [[X'X; X'Z];[Z'X; Z'Z]], where X is a vector -of 1's. Thus the element at position (1,1) is simply the number of columns of -Z, and the vector Z'X contains the sums per column of Z. For now, I have a -working code, but it is quite sloppy because I do not find a function for which -I can easily increase the dimension of a sparse matrix or even set an element -to a specific value. Is there any function in Combinatorial BLAS which can do -this? 

- -

 

- -

A7: Not out of the box. You don't want an SpAsgn or any -variant of it because it can't grow the matrix. You want some sort of matrix -append. How about using Find(…) and Sparse(…)?  The Matlab care of what -you want to do is:

- -

 

- -

X = ones(size(Z,2),1) 

- -

M = [X' * X, X' * Z; Z'* X, Z' * Z]

- -

 

- -

Supporting such a general concatenation efficiently might be hard -to add at this point. Instead,  there is a Concatenate(…) -function for vectors. Armed with Concatenate(…), find(), and the sparse-like -constructor, one can solve your problem.  Check out the working example in -ReleaseTests/FindSparse.cpp

- -

 

- -

---

- -

 

- -

Q8: Does CombBLAS include the API to perform a symmetric -permutation on a matrix, as explained in your SISC -paper

- -

 

- -

A8: Yes it does. Check out the -ReleaseTests/IndexingTiming.cpp for an example.

- -

 

- -

--- 

- -

 

- -

Q9: How can I use small test case to see whether the -operation on matrix is correct? In other words, how do I print all the -information of a matrix with each value in matrix? 

- -

I can use PrintInfo to print basic information, but it only gives -me number of rows and columns and nnz

- -

 

- -

A9: Our recommendation is to use SaveGathered(…) to dump the whole matrix into a file in triples -(matrix market) format. For vectors, we have a much much faster version:  FullyDistVec::ParallelWrite (…)

- -

 

- -

---

- -

 

- -

Q10: Does CombBLAS code run on any -graph size or there is some limitation on the dimension of the matrix A. I mean -should it be a multiple of sqrt(p) where p is total number of processors. 

- -

 

- -

A10: No, the matrix dimension does not have to be a multiple -of sqrt(p) but it should be bigger than sqrt(p). In other words you can have a -5x5 matrix on 4 processors but not on 36 processors. We don't really see the -point of using more than |V|^2 processors.

- -

 

- -

---

- -

 

- -

Q11: My comparison results on real graph inputs revealed -something weird. In input loc-gowalla, how can 16 processors time(called -time_16) and 

- -

64 processors time(called time_64) which time_64*4<time_16 - which is more than linear scale? 

- -

 

- -

A11: The complexity of the parallel algorithm drops as -sub-matrices owned by each processor gets sparser. In particular, it is -proportional to O(flops x log(ni)) where ni is the size of the intersection of -the set of nonzero columns of Aik and nonzero rows of Bkj for A*B. What might -happen as p increases is that there is a phase transition that makes ni drop -significantly for your input (for p=64, each sub-matrix will have only ~1.2 -nonzeros per row or column). More details are in the SISC -paper and the references therein. Hope this makes sense. This is why -I don't suggest people use CombBLAS for small p (< 40) because it is not on -the top of its game for small number of processors. 

- -

 

- -

--- 

- -

 

- -

Q12: Should the input file have nodes numbered from 1 or it -is fine if the nodes are numbered from 0?

- -

 

- -

A12: If you're using the human readable matrix market format -as your input, then it should be 1-indexed. 

- -

 

- -

---

- -

 

- -

Q13: I'm wondering for breadth-first-search, under the hood -does the matrix-vector multiplication method change based on the sparsity of -the frontier vector, or does the underlying matrix-vector multiplication assume -the frontier is always sparse?

- -

 

- -

A13: Depending on your definition of sparseness, the -frontier is almost always sparse. We use the pragmatic definition of -"sparse" in the sense that a vector is sparse if it is worth taking -advantage of the sparsity in there. I'd guess, for a dense vector assumption to -be competitive, it would have to have at least 1/3 of its potential locations -nonzero. However, I might be wrong (and you're welcome to prove me wrong). To -answer your question more directly, CombBLAS supports both dense and sparse -right hand side vectors, but the specific BFS implementation does not -adapt. 

- -

 

- -

---

- -

 

- -

Q14: Could you briefly explain the difference in your -implementations of matrix-sparse vector and matrix-dense vector multiply? For -example, is the sparse vector case a write-based approach: Every element -updates all of its neighbors (from a graph-theoretic standpoint) locations in -the output vector; and the dense vector case a read-based approach: Every -element reads some value from each of its neighbors and updates its own entry -in the resulting vector?

- -

 

- -

A14: Sparse matrix-sparse vector is "right hand side -vector structure" driven. In y = A*x, for each nonzero x_i, we scale the -column A(:,i) with that and merge the scaled sparse columns results into y. The -computation boils down into merging sparse columns into one. Combinatorial -BLAS is a matrix-vector based library, so thinking in -terms of updates on single entries is probably not the right abstraction.

- -

 

- -

Sparse matrix-dense vector is slightly different in the sense that -it is driven by the matrix structure; you basically stream the matrix. The -correctness of both operations are handled by a SPA-like or heap-like data -structure that merges multiple intermediate values contributing to the same -output location; no atomics are used.

- -

 

- -

--- 

- -

 

- -

Q15: I would like to get your opinion -on how sparse-matrix based implementations compare with more native -implementations

- -

 

- -

A15: Sparse matrix abstraction, like -any abstraction, will leave some performance on the table. In particular it is -prone to performing extra passes over data or creating extra temporaries (if -you've ever programmed in Matlab; this is similar). On the other hand, sparse -matrix abstraction gives you "primitives" to implement graph -"algorithms" as opposed to the algorithms themselves. For instance, -CombBLAS has sparse matrix x sparse vector over a semiring as opposed to BFS, -because now using the same primitive one can implement MIS (maximal independent -set) too, only by changing the semiring. Or one can perform run time -filtering on edges based on the attributes, similarly by changing the semiring -functions (therefore extending functionality to semantic graphs). Indeed this -is what we've done in our upcoming IPDPS'13 paper.

- -

 

- -

---

- -

 

- -

Q16: Is there an effort to incorporate the bottom-up BFS of -Scott Beamer into CombBLAS?

- -

 

- -

A16: Yes, it is already done. Just use the dobfs executable -(made from DirOptBFS.cpp).

- -

 

- -

---

- -

 

- -

Q17: My serial code is faster than CombBLAS on a single -core.

- -

 

- -

A17: I believe that. CombBLAS targets -"scalability", not optimizing the single core performance.

- -

 

- -

Examples:

- -

- think about the 2D BFS. CombBLAS does not use a CSR like data -structure because that is not memory scalable due to problems of hypersparsity in large concurrencies. Instead -CombBLAS opts to use a slower (about 2x around 1000 cores) but memory -scalable format called DCSC.  

- -

- think about betweenness centrality which uses sparse -matrix-matrix multiply. CombBLAS doesn't use the fastest serial algorithm as -its subroutine because it doesn't  scale to thousands of cores. Instead it -uses a outer-product algorithm that is significantly slower for p=1, but scales -indefinitely.

- -

 

- -

--- 

- -

 

- -

Q18: Looking at the output of your Graph500 application, I -noticed a large number of self-edges removed. That’s very interesting.

- -

 

- -

A18: The duplicate edges problem is inherent to the R-MAT -generator on large scale, unless some special kind of noise is added. Check -here for a great analysis of this phenomenon: http://arxiv.org/abs/1102.5046

- -

 

- -

---

- -

 

- -

Q19: How are you counting the number of edges traversed in -Graph500? Is this still using the original verify.c file provided with the -reference version of the Graph500 benchmark and passing in the parent tree?

- -

 

- -

A19: It is calculated by summing the -degrees of the discovered vertices using EWiseMult(…) -followed by a Reduce(…). Degrees are pre-symmetrization -(original edges), so we're not over-counting. However, we count self-loops and -duplicates as mentioned in the benchmark specs.

- -

 

- -

---

- -

 

- -

Q20: My computations -finishes fine but I get an “Attempting to use an MPI routine after finalizing -MPICH” afterwards.

- -

 

- -

A20: To avoid the -finalization error, please imitate an example such as MultTest.cpp: http://gauss.cs.ucsb.edu/~aydin/CombBLAS/html/_mult_test_8cpp_source.html

- -

The curly brackets around the code are intentional. Since -distributed objects have MPI related pointers in them, those pointers are -released once the destructors are called. In C++ (at least until C++11) there -isn’t a good way to call the destructor manually, so the destructor is called -immediately before the program exists, which is after the MPI_Finalize. Since -the MPI related objects are destructed after MPI_Finalize, you see this error. -Try the curly brackets approach.

- -

 

- -

Go back -to the the Combinatorial BLAS home page.

- -
- - - - + + + + + + + + + + +The Combinatorial BLAS Release Notes + + + + + + + + + + + + +
+ +

Frequently Asked Questions about +Combinatorial BLAS

+ +

Go back +to the Combinatorial BLAS home page.

+ +

 

+ +

 

+ +

Q1: I would like to use your Combinatorial BLAS code for +some of my experiments which involve sparse matrix multiplication. However, it +is not clear how to write an output of PSpGEMM(…) +function to a file. I've tried to use "put" function of SpParMat, but +it outputs part of the matrix that corresponds to particular process and not +the whole matrix. Is there a way to do it using your code?

+ +

 

+ +

A1: Yes, SpParMat::SaveGathered(…) +will create a single file output (albeit slow) sorted with increasing row id's +when called like A.SaveGathered("product.txt"). +The caveat is that  "gathered" I/O in human readable form is +quite slow due to serialization on large processor counts. It should only be +used for debugging, ideally. For vectors, we have a much much +faster version:  +FullyDistVec::ParallelWrite (…), which +should be used instead. SpParMat will also get a ParallelWrite soon.

+ +

----

+ +

 

+ +

Q2: Does Combinatorial BLAS support in-node multithreading? 

+ +

 

+ +

A2: Almost all expensive primitives +(SpGEMM, SpMV with sparse vectors, SpMV with dense vectors, EWiseMult, Apply, +Set) are hybrid multithreaded within a socket. Read this example.

+ +

 

+ +

---

+ +

 

+ +

Q3: Reading/writing text files is really slow for my +purposes, what can I do?

+ +

 

+ +

A3: Starting from version 1.6, we now have +extremely fast Matrix Market (text) file reading; check out +SpParMat::ParallelReadMM() and FullyDistVec::ParallelReadMM().

+ +

 

+ +

---

+ +

 

+ +

Q4: Is there a preferred way to prune elements from a +SpParMat according to a predicate?

+ +

 

+ +

A4: Yes, SpParMat::Prune(…) will do it according to a predicate. An +overloaded version of the same function, SpParMat::Prune(ri,ci) +will prune all entries whose row indices are in ri and column indices are in ci

+ +

 

+ +

---

+ +

 

+ +

Q5: I am trying to run CombBLAS on Windows but the MS MPI does +not seem to support MPI C++ bindings.

+ +

 

+ +

A5: Combinatorial BLAS recently (version 1.3.0) switched to +the C API for all its internal MPI calls. Since then, we've also compiled CombBLAS +on a Windows machine without problems. However, we recommend using an open +source MPI on Windows too, such as MPICH-2.

+ +

 

+ +

---

+ +

 

+ +

Q6: I would like to use Combinatorial BLAS for some parallel +sparse matrix-matrix multiplications. This works quite well, however when I try +to assign a m x 1 sparse matrix (actually a vector) to the first column of an +existing sparse matrix with SpAsgn I get an error saying: "Matrix is too +small to be splitted". Is this because it's not possible to use SpAsgn on +vector-like matrices?

+ +

 

+ +

A6: SpAsgn internally uses the memory-efficient Mult_AnXBn_DoubleBuff (as opposed to Mult_AnXBn_Synch). +You could go into SpParMat<IT,NT,DER>::SpAsgn(...) +and change occurrences of Mult_AnXBn_DoubleBuff +to Mult_AnXBn_Synch. However, this will likely +only solve your problem for the serial case, because CombBLAS cannot +effectively 2D decompose an m x 1 matrix: each dimension should ideally be at +least sqrt(p). It is much better to represent that vector as a FullyDistSpVec.

+ +

 

+ +

---

+ +

 

+ +

Q7: Starting from a general sparse matrix Z, I want to +construct the symmetric matrix M: [[X'X; X'Z];[Z'X; Z'Z]], where X is a vector +of 1's. Thus the element at position (1,1) is simply the number of columns of +Z, and the vector Z'X contains the sums per column of Z. For now, I have a +working code, but it is quite sloppy because I do not find a function for which +I can easily increase the dimension of a sparse matrix or even set an element +to a specific value. Is there any function in Combinatorial BLAS which can do +this? 

+ +

 

+ +

A7: Not out of the box. You don't want an SpAsgn or any +variant of it because it can't grow the matrix. You want some sort of matrix +append. How about using Find(…) and Sparse(…)? The Matlab equivalent of what +you want to do is:

+ +

 

+ +

X = ones(size(Z,2),1) 

+ +

M = [X' * X, X' * Z; Z'* X, Z' * Z]

+ +

 

+ +

Supporting such a general concatenation efficiently might be hard +to add at this point. Instead,  there is a Concatenate(…) +function for vectors. Armed with Concatenate(…), find(), and the sparse-like +constructor, one can solve your problem.  Check out the working example in +ReleaseTests/FindSparse.cpp

+ +

 

+ +

---

+ +

 

+ +

Q8: Does CombBLAS include the API to perform a symmetric +permutation on a matrix, as explained in your SISC +paper

+ +

 

+ +

A8: Yes it does. Check out the +ReleaseTests/IndexingTiming.cpp for an example.

+ +

 

+ +

--- 

+ +

 

+ +

Q9: How can I use small test case to see whether the +operation on matrix is correct? In other words, how do I print all the +information of a matrix with each value in matrix? 

+ +

I can use PrintInfo to print basic information, but it only gives +me number of rows and columns and nnz

+ +

 

+ +

A9: Our recommendation is to use SaveGathered(…) to dump the whole matrix into a file in triples +(matrix market) format. For vectors, we have a much much faster version:  FullyDistVec::ParallelWrite (…)

+ +

 

+ +

---

+ +

 

+ +

Q10: Does CombBLAS code run on any +graph size, or is there some limitation on the dimension of the matrix A? I mean, +should it be a multiple of sqrt(p), where p is the total number of processors?

+ +

 

+ +

A10: No, the matrix dimension does not have to be a multiple +of sqrt(p) but it should be bigger than sqrt(p). In other words you can have a +5x5 matrix on 4 processors but not on 36 processors. We don't really see the +point of using more than |V|^2 processors.

+ +

 

+ +

---

+ +

 

+ +

Q11: My comparison results on real graph inputs revealed +something weird. In input loc-gowalla, how can 16 processors time(called +time_16) and 

+ +

64 processors time(called time_64) which time_64*4<time_16 + which is more than linear scale? 

+ +

 

+ +

A11: The complexity of the parallel algorithm drops as +the sub-matrices owned by each processor get sparser. In particular, it is +proportional to O(flops x log(ni)), where ni is the size of the intersection of +the set of nonzero columns of Aik and nonzero rows of Bkj for A*B. What might +happen as p increases is that there is a phase transition that makes ni drop +significantly for your input (for p=64, each sub-matrix will have only ~1.2 +nonzeros per row or column). More details are in the SISC +paper and the references therein. Hope this makes sense. This is why +I don't suggest people use CombBLAS for small p (< 40) because it is not at +the top of its game for small numbers of processors.

+ +

 

+ +

--- 

+ +

 

+ +

Q12: Should the input file have nodes numbered from 1 or it +is fine if the nodes are numbered from 0?

+ +

 

+ +

A12: If you're using the human readable matrix market format +as your input, then it should be 1-indexed. 

+ +

 

+ +

---

+ +

 

+ +

Q13: I'm wondering for breadth-first-search, under the hood +does the matrix-vector multiplication method change based on the sparsity of +the frontier vector, or does the underlying matrix-vector multiplication assume +the frontier is always sparse?

+ +

 

+ +

A13: Depending on your definition of sparseness, the +frontier is almost always sparse. We use the pragmatic definition of +"sparse" in the sense that a vector is sparse if it is worth taking +advantage of the sparsity in there. I'd guess, for a dense vector assumption to +be competitive, it would have to have at least 1/3 of its potential locations +nonzero. However, I might be wrong (and you're welcome to prove me wrong). To +answer your question more directly, CombBLAS supports both dense and sparse +right hand side vectors, but the specific BFS implementation does not +adapt. 

+ +

 

+ +

---

+ +

 

+ +

Q14: Could you briefly explain the difference in your +implementations of matrix-sparse vector and matrix-dense vector multiply? For +example, is the sparse vector case a write-based approach: Every element +updates all of its neighbors (from a graph-theoretic standpoint) locations in +the output vector; and the dense vector case a read-based approach: Every +element reads some value from each of its neighbors and updates its own entry +in the resulting vector?

+ +

 

+ +

A14: Sparse matrix-sparse vector is "right hand side +vector structure" driven. In y = A*x, for each nonzero x_i, we scale the +column A(:,i) with that and merge the scaled sparse columns results into y. The +computation boils down into merging sparse columns into one. Combinatorial +BLAS is a matrix-vector based library, so thinking in +terms of updates on single entries is probably not the right abstraction.

+ +

 

+ +

Sparse matrix-dense vector is slightly different in the sense that +it is driven by the matrix structure; you basically stream the matrix. The +correctness of both operations is handled by a SPA-like or heap-like data +structure that merges multiple intermediate values contributing to the same +output location; no atomics are used.

+ +

 

+ +

--- 

+ +

 

+ +

Q15: I would like to get your opinion +on how sparse-matrix based implementations compare with more native +implementations

+ +

 

+ +

A15: Sparse matrix abstraction, like +any abstraction, will leave some performance on the table. In particular it is +prone to performing extra passes over data or creating extra temporaries (if +you've ever programmed in Matlab; this is similar). On the other hand, sparse +matrix abstraction gives you "primitives" to implement graph +"algorithms" as opposed to the algorithms themselves. For instance, +CombBLAS has sparse matrix x sparse vector over a semiring as opposed to BFS, +because now using the same primitive one can implement MIS (maximal independent +set) too, only by changing the semiring. Or one can perform run time +filtering on edges based on the attributes, similarly by changing the semiring +functions (therefore extending functionality to semantic graphs). Indeed this +is what we've done in our upcoming IPDPS'13 paper.

+ +

 

+ +

---

+ +

 

+ +

Q16: Is there an effort to incorporate the bottom-up BFS of +Scott Beamer into CombBLAS?

+ +

 

+ +

A16: Yes, it is already done. Just use the dobfs executable +(made from DirOptBFS.cpp).

+ +

 

+ +

---

+ +

 

+ +

Q17: My serial code is faster than CombBLAS on a single +core.

+ +

 

+ +

A17: I believe that. CombBLAS targets +"scalability", not optimizing the single core performance.

+ +

 

+ +

Examples:

+ +

- think about the 2D BFS. CombBLAS does not use a CSR like data +structure because that is not memory scalable due to problems of hypersparsity in large concurrencies. Instead +CombBLAS opts to use a slower (about 2x around 1000 cores) but memory +scalable format called DCSC.  

+ +

- think about betweenness centrality, which uses sparse +matrix-matrix multiply. CombBLAS doesn't use the fastest serial algorithm as +its subroutine because it doesn't scale to thousands of cores. Instead it +uses an outer-product algorithm that is significantly slower for p=1, but scales +indefinitely.

+ +

 

+ +

--- 

+ +

 

+ +

Q18: Looking at the output of your Graph500 application, I +noticed a large number of self-edges removed. That’s very interesting.

+ +

 

+ +

A18: The duplicate edges problem is inherent to the R-MAT +generator on large scale, unless some special kind of noise is added. Check +here for a great analysis of this phenomenon: http://arxiv.org/abs/1102.5046

+ +

 

+ +

---

+ +

 

+ +

Q19: How are you counting the number of edges traversed in +Graph500? Is this still using the original verify.c file provided with the +reference version of the Graph500 benchmark and passing in the parent tree?

+ +

 

+ +

A19: It is calculated by summing the +degrees of the discovered vertices using EWiseMult(…) +followed by a Reduce(…). Degrees are pre-symmetrization +(original edges), so we're not over-counting. However, we count self-loops and +duplicates as mentioned in the benchmark specs.

+ +

 

+ +

---

+ +

 

+ +

Q20: My computation +finishes fine, but I get an “Attempting to use an MPI routine after finalizing +MPICH” error afterwards.

+ +

 

+ +

A20: To avoid the +finalization error, please imitate an example such as MultTest.cpp: http://gauss.cs.ucsb.edu/~aydin/CombBLAS/html/_mult_test_8cpp_source.html

+ +

The curly brackets around the code are intentional. Since +distributed objects have MPI-related pointers in them, those pointers are +released once the destructors are called. In C++ (at least until C++11) there +isn’t a good way to call the destructor manually, so the destructor is called +immediately before the program exits, which is after MPI_Finalize. Since +the MPI-related objects are destructed after MPI_Finalize, you see this error. +Try the curly brackets approach.

+ +

 

+ +

Go back +to the Combinatorial BLAS home page.

+ +
+ + + + diff --git a/ReleaseTests/CMakeLists.txt b/ReleaseTests/CMakeLists.txt index e37c57d1..605c0395 100644 --- a/ReleaseTests/CMakeLists.txt +++ b/ReleaseTests/CMakeLists.txt @@ -1,7 +1,13 @@ # Top level directory has the include files +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ADD_EXECUTABLE( MultTiming MultTiming.cpp ) ADD_EXECUTABLE( MultTest MultTest.cpp ) +cuda_add_executable( MultTimingCUDA MultTimingCUDA.cu) +target_compile_options(MultTimingCUDA PRIVATE -Mlarge_arrays) +cuda_add_executable( MultAccuracyCUDA MultAccuracyCUDA.cu) + + ADD_EXECUTABLE( ReduceTest ReduceTest.cpp ) ADD_EXECUTABLE( TransposeTest TransposeTest.cpp ) ADD_EXECUTABLE( IteratorTest IteratorTest.cpp ) @@ -21,7 +27,11 @@ ADD_EXECUTABLE( KTipsTest KTipsTest.cpp ) TARGET_LINK_LIBRARIES( MultTiming CombBLAS) TARGET_LINK_LIBRARIES( MultTest CombBLAS) -TARGET_LINK_LIBRARIES( ReduceTest CombBLAS) + + +TARGET_LINK_LIBRARIES( MultTimingCUDA CombBLAS) +TARGET_LINK_LIBRARIES( MultAccuracyCUDA CombBLAS) +TARGET_LINK_LIBRARIES( ReduceTest CombBLAS MPI::MPI_CXX) TARGET_LINK_LIBRARIES( TransposeTest CombBLAS) TARGET_LINK_LIBRARIES( IteratorTest CombBLAS) TARGET_LINK_LIBRARIES( IndexingTest CombBLAS) @@ -40,6 +50,7 @@ TARGET_LINK_LIBRARIES( KTipsTest CombBLAS) ADD_TEST(NAME GenMMWrite_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ 20 16 1 scale20_ef16_symmetric.mtx) ADD_TEST(NAME Multiplication_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ../TESTDATA/x_65536_halfdense.txt ../TESTDATA/y_65536_halfdense.txt ) +ADD_TEST(NAME Multiplication_Test_CUDA COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ../TESTDATA/x_65536_halfdense.txt ../TESTDATA/y_65536_halfdense.txt ) ADD_TEST(NAME SpGEMM3D_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 16 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ) ADD_TEST(NAME HashSpGEMMTest COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ) ADD_TEST(NAME Reduction_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/sprand10000 ../TESTDATA/sprand10000_sumcols ../TESTDATA/sprand10000_sumrows) @@ -48,4 +59,4 @@ ADD_TEST(NAME Transpose_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA B_100x100.txt B_10x30_Indexed.txt rand10outta100.txt rand30outta100.txt) ADD_TEST(NAME SpAsgn_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA A_100x100.txt A_with20x30hole.txt dense_20x30matrix.txt A_wdenseblocks.txt 20outta100.txt 30outta100.txt) ADD_TEST(NAME GalerkinNew_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/grid3d_k5.txt ../TESTDATA/offdiag_grid3d_k5.txt ../TESTDATA/diag_grid3d_k5.txt ../TESTDATA/restrict_T_grid3d_k5.txt) -ADD_TEST(NAME FindSparse_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA findmatrix.txt) +ADD_TEST(NAME FindSparse_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA findmatrix.txt) \ No newline at end of file diff --git a/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err b/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime 
/work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err b/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err b/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
[i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/InducedSubgraphsTest.cpp b/ReleaseTests/InducedSubgraphsTest.cpp index 22f6a446..9e6aef17 100644 --- a/ReleaseTests/InducedSubgraphsTest.cpp +++ b/ReleaseTests/InducedSubgraphsTest.cpp @@ -1,49 +1,56 @@ -#include -#include -#include +#include "CombBLAS/CombBLAS.h" #include -#include +#include +#include +#include #include -#include "CombBLAS/CombBLAS.h" - -int main(int argc, char *argv[]) -{ - int nprocs, myrank; - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - if (argc < 3) { - if (!myrank) - std::cerr << "Usage: ./Subgraphs2ProcsTest " << std::endl; - MPI_Finalize(); - return -1; - } - { - if (!myrank) std::cerr << "processor grid: (" << std::sqrt(nprocs) << " x " << std::sqrt(nprocs) << ")" << std::endl; - - std::shared_ptr fullWorld; - fullWorld.reset(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0)); - - combblas::SpParMat > A(fullWorld); - combblas::FullyDistVec assignments(A.getcommgrid()); - - A.ParallelReadMM(std::string(argv[1]), true, combblas::maximum()); - assignments.ParallelRead(std::string(argv[2]), true, combblas::maximum()); - - std::vector local_idx_map; - - combblas::SpCCols locmat = A.InducedSubgraphs2Procs(assignments, local_idx_map); +#include - for (auto colit = locmat.begcol(); colit != locmat.endcol(); ++colit) { - for (auto nzit = locmat.begnz(colit); nzit != locmat.endnz(colit); ++nzit) { - std::cout << myrank << ": " << local_idx_map[nzit.rowid()]+1 << "\t" << local_idx_map[colit.colid()]+1 << "\t" << nzit.value() << std::endl; - } - } - std::cout << std::endl; +int main(int argc, char *argv[]) { + int nprocs, myrank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (argc < 3) { + if (!myrank) + std::cerr << "Usage: ./Subgraphs2ProcsTest " + << std::endl; + MPI_Finalize(); + return -1; + } + { + if (!myrank) + std::cerr << "processor grid: (" << std::sqrt(nprocs) << " x " + << std::sqrt(nprocs) << ")" << std::endl; + + std::shared_ptr fullWorld; + fullWorld.reset(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0)); + + combblas::SpParMat> A( + fullWorld); + combblas::FullyDistVec assignments(A.getcommgrid()); + + A.ParallelReadMM(std::string(argv[1]), true, combblas::maximum()); + assignments.ParallelRead(std::string(argv[2]), true, + combblas::maximum()); + + std::vector local_idx_map; + + combblas::SpCCols locmat = + A.InducedSubgraphs2Procs(assignments, local_idx_map); + + for (auto colit = locmat.begcol(); colit != locmat.endcol(); ++colit) { + for (auto nzit = locmat.begnz(colit); nzit != locmat.endnz(colit); + ++nzit) { + std::cout << myrank << ": " << local_idx_map[nzit.rowid()] + 1 << "\t" + << local_idx_map[colit.colid()] + 1 << "\t" << nzit.value() + << std::endl; + } } + std::cout << std::endl; + } - MPI_Finalize(); - return 0; + MPI_Finalize(); + return 0; } diff --git a/ReleaseTests/MultAccuracyCUDA.cu b/ReleaseTests/MultAccuracyCUDA.cu new file mode 100644 index 00000000..a9ba1e0a --- /dev/null +++ b/ReleaseTests/MultAccuracyCUDA.cu @@ -0,0 +1,172 @@ +/****************************************************************/ +/* Parallel Combinatorial BLAS Library (for Graph Computations) */ +/* version 1.6 -------------------------------------------------*/ +/* date: 6/15/2017 ---------------------------------------------*/ +/* authors: Ariful Azad, Aydin Buluc 
--------------------------*/ +/****************************************************************/ +/* + Copyright (c) 2010-2017, The Regents of the University of California + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +// #include + +#ifdef __CUDACC__ + +#include +#include +#include +#include +#include +#include +#include +#include "CombBLAS/CombBLAS.h" +// #include "../include/GALATIC/source/device/Multiply.cuh" + +using namespace std; +using namespace combblas; + +#ifdef TIMING +double cblas_alltoalltime; +double cblas_allgathertime; +#endif + +#ifdef _OPENMP +int cblas_splits = omp_get_max_threads(); +#else +int cblas_splits = 1; +#endif + +#define ElementType double +int ITERATIONS = 50; + +// Simple helper class for declarations: Just the numerical type is templated +// The index type and the sequential matrix type stays the same for the whole code +// In this case, they are "int" and "SpDCCols" +template +class PSpMat +{ +public: + typedef SpDCCols DCCols; + typedef SpParMat MPI_DCCols; +}; + +// Outline of debug stages +// stage = 0: LocalHybrid does not run/immediately returns +// stage = 1: LocalHybrid mallocs and transposes as needed, but returns immediately after +// stage = 2: LocalHybrid runs the kernel, but does not perform cleanup +// stage = 3: Full run of LocalHybrid +// stages 1 & 2 may lead to memory leaks, be aware on memory limited systems +int main(int argc, char *argv[]) +{ +#ifdef GPU_ENABLED +// SpParHelper::Print("GPU ENABLED\n"); +#endif + int nprocs, myrank; + int host_rank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + typedef PlusTimesSRing PTDOUBLEDOUBLE; + + if (argc < 3) + { + if (myrank == 0) + { + cout << "Usage: ./MultTest " << endl; + cout << ",, are absolute addresses, and files should be in triples format" << endl; + } + MPI_Finalize(); + return -1; + } + { + string Aname(argv[1]); + string Bname(argv[2]); + + if (myrank == 0 || nprocs == 1) + { + std::cout << Aname << std::endl; + std::cout << Bname << std::endl; + } + typedef PlusTimesSRing MinPlusSRing; + typedef SelectMaxSRing SR; + + shared_ptr fullWorld; + fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); + + std::cout << "Constructing objects:" << std::endl; + // construct objects + PSpMat::MPI_DCCols A(fullWorld); + PSpMat::MPI_DCCols B(fullWorld); + PSpMat::MPI_DCCols C(fullWorld); + PSpMat::MPI_DCCols CControl(fullWorld); + + A.ParallelReadMM(Aname, true, maximum()); +#ifndef NOGEMM + B.ParallelReadMM(Bname, true, maximum()); 
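// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the commit above): the CPU reference
// product over the plus-times semiring with the template arguments written
// out explicitly. The argument order <semiring, output value type, output
// local matrix type> follows the CombBLAS Mult_AnXBn_DoubleBuff usage in this
// file; the helper name CpuReference and the spelled-out arguments are
// assumptions of this sketch, not code from the patch.
static PSpMat<double>::MPI_DCCols CpuReference(PSpMat<double>::MPI_DCCols &A,
                                               PSpMat<double>::MPI_DCCols &B)
{
    // Plus-times semiring over doubles, matching the PTDOUBLEDOUBLE typedef above.
    typedef PlusTimesSRing<double, double> PTDD;
    // Distributed SpGEMM on the host; the CUDA path in this test is compared
    // against exactly this kind of call.
    return Mult_AnXBn_DoubleBuff<PTDD, double, PSpMat<double>::DCCols>(A, B);
}
// ---------------------------------------------------------------------------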
+ +#endif + A.PrintInfo(); + +#ifndef NOGEMM + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + C.PrintInfo(); + cudaDeviceSynchronize(); + { + CControl = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + C.PrintInfo(); + if (CControl == C) + { + SpParHelper::Print("Double buffered multiplication working correctly\n"); + } + else + { + SpParHelper::Print("ERROR in double CUDA buffered multiplication, from CPU!\n"); + A.PrintInfo(); + C.PrintInfo(); + CControl.PrintInfo(); + SpDCCols spdcsc = C.seq(); + Dcsc *dcsc = C.seq().GetDCSC(); + double maxdiff = 0; + double a = 0; + double b = 0; + for (int i = 0; i < spdcsc.getnnz(); ++i) + { + if (abs(dcsc->numx[i] - CControl.seq().GetDCSC()->numx[i]) > maxdiff) + { + maxdiff = abs(dcsc->numx[i] - CControl.seq().GetDCSC()->numx[i]); + a = dcsc->numx[i]; + b = CControl.seq().GetDCSC()->numx[i]; + } + } + std::cout << "MAX DIFF = " << maxdiff << std::endl; + std::cout << a << std::endl; + std::cout << b << std::endl; + } + } + } +#endif + +MPI_Finalize(); +return 0; +} +#endif \ No newline at end of file diff --git a/ReleaseTests/MultTimingCUDA.cu b/ReleaseTests/MultTimingCUDA.cu new file mode 100644 index 00000000..09d1e477 --- /dev/null +++ b/ReleaseTests/MultTimingCUDA.cu @@ -0,0 +1,280 @@ +/****************************************************************/ +/* Parallel Combinatorial BLAS Library (for Graph Computations) */ +/* version 1.6 -------------------------------------------------*/ +/* date: 6/15/2017 ---------------------------------------------*/ +/* authors: Ariful Azad, Aydin Buluc --------------------------*/ +/****************************************************************/ +/* + Copyright (c) 2010-2017, The Regents of the University of California + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ */ + +// #include + +#ifdef __CUDACC__ + +#include +#include +#include +#include +#include +#include +#include +#include "CombBLAS/CombBLAS.h" +// #include "../include/GALATIC/source/device/Multiply.cuh" + +using namespace std; +using namespace combblas; + +#ifdef TIMING +double cblas_alltoalltime; +double cblas_allgathertime; +#endif + +#ifdef _OPENMP +int cblas_splits = omp_get_max_threads(); +#else +int cblas_splits = 1; +#endif + +#define ElementType double +int ITERATIONS = 50; + +// Simple helper class for declarations: Just the numerical type is templated +// The index type and the sequential matrix type stays the same for the whole code +// In this case, they are "int" and "SpDCCols" +template +class PSpMat +{ +public: + typedef SpDCCols DCCols; + typedef SpParMat MPI_DCCols; +}; + +// Outline of debug stages +// stage = 0: LocalHybrid does not run/immediately returns +// stage = 1: LocalHybrid mallocs and transposes as needed, but returns immediately after +// stage = 2: LocalHybrid runs the kernel, but does not perform cleanup +// stage = 3: Full run of LocalHybrid +// stages 1 & 2 may lead to memory leaks, be aware on memory limited systems +int main(int argc, char *argv[]) +{ +#ifdef GPU_ENABLED +// SpParHelper::Print("GPU ENABLED\n"); +#endif + int nprocs, myrank; + int host_rank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + typedef PlusTimesSRing PTDOUBLEDOUBLE; + + + if (argc < 4) + { + if (myrank == 0) + { + cout << "Usage: ./MultTest " << endl; + cout << ",, are absolute addresses, and files should be in triples format" << endl; + } + MPI_Finalize(); + return -1; + } + { + string ITERS(argv[1]); + string COMMTEST(argv[2]); + string Aname(argv[3]); + string Bname(argv[4]); + + if(myrank == 0 || nprocs == 1) { + std::cout << Aname << std::endl; + std::cout << Bname << std::endl; + std::cout << nprocs << std::endl; + std::string filename = "output" + Aname.substr(0, Aname.length() - 4) + ".txt"; + + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + fprintf(f, "Input A: %s, with NPROCS: %i\n", Aname.c_str(), nprocs); + fclose(f); + } + ITERATIONS = std::stoi(ITERS); + + bool COMMTESTON = std::stoi(COMMTEST) > 0; + //if(!COMMTESTON) GPUTradeoff = 1024 * 100 * 500; + MPI_Barrier(MPI_COMM_WORLD); + typedef PlusTimesSRing MinPlusSRing; + typedef SelectMaxSRing SR; + + shared_ptr fullWorld; + fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); + + // construct objects + PSpMat::MPI_DCCols A(fullWorld); + PSpMat::MPI_DCCols B(fullWorld); + PSpMat::MPI_DCCols C(fullWorld); + + A.ParallelReadMM(Aname, true, maximum()); +#ifndef NOGEMM + B.ParallelReadMM(Bname, true, maximum()); + +#endif + //A.PrintInfo(); + +#ifndef NOGEMM + double t3 = MPI_Wtime(); + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + double t4 = MPI_Wtime(); + std::cout << "Time taken: " << t4 - t3 << std::endl; + + C.PrintInfo(); + cudaDeviceSynchronize(); + { // force the calling of C's destructor + t3 = MPI_Wtime(); + //C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + t4 = MPI_Wtime(); + std::cout << "Time taken: " << t4 - t3 << std::endl; + C.PrintInfo(); + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + double t1 = MPI_Wtime(); // initilize (wall-clock) timer + for (int i = 0; i < 
ITERATIONS; i++) + { + C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + } + MPI_Barrier(MPI_COMM_WORLD); + double t2 = MPI_Wtime(); + MPI_Pcontrol(-1, "SpGEMM_DoubleBuff"); + if (myrank == 0 || nprocs == 1) + { + std::string filename = "output" + Aname.substr(0,Aname.length() - 4) + ".txt"; + //std::cout << filename.c_str() << std::endl; + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + fprintf(f, "CPU Time: %.6lf\n", (t2 - t1) / ((double) ITERATIONS)); + fclose(f); + } + int maxhits = 0; + for (int j = 0; j < 500; ++j) + { + //if(!COMMTESTON) j = 500; + //std::cout << j << std::endl; + size_t free, total; + int id; + MPI_Comm_rank(MPI_COMM_WORLD, &id); + cudaMemGetInfo(&free, &total); + //std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl; + + commtime = 0; + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + { + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + + int svdhits = datahits + rowshits + colhits; + int commper = comms; + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + GPUTradeoff = 1024 * 100 * j; + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + { + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + + bool allt; + int nnprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nnprocs); + int newhits = datahits + rowshits + colhits; + if (myrank == 0) { + for(int i = 1; i < nnprocs; ++i) { + MPI_Status idc; + int recv; + MPI_Recv(&recv, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &idc); + svdhits += recv; + MPI_Recv(&recv, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &idc); + newhits += recv; + } + } else { + MPI_Send(&svdhits, 1, MPI_INT, 0, 0, MPI_COMM_WORLD); + MPI_Send(&newhits, 1, MPI_INT, 0, 0, MPI_COMM_WORLD); + } + allt = j > 0 && svdhits == newhits; + if(j == 0) maxhits = newhits; + MPI_Bcast(&allt, 1, MPI_INT, 0, MPI_COMM_WORLD); + if(allt) { + continue; + } + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + commtime = 0; + comptime = 0; + checkingTime = 0; + // std::cout << "Running with tradeoff of " << 100 * j << "KB" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + t1 = MPI_Wtime(); // initilize (wall-clock) timer + + for (int i = 0; i < ITERATIONS; i++) + { + // std::cout << "--------------NEW ITER------------" << std::endl; + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + MPI_Barrier(MPI_COMM_WORLD); + t2 = MPI_Wtime(); + MPI_Pcontrol(-1, "SpGEMM_DoubleBuff"); + commper = 3 * nnprocs * nnprocs; + if (myrank == 0 || nprocs == 1) + { + std::string filename = "output" + Aname.substr(0,Aname.length() - 4) + ".txt"; + //std::cout << filename.c_str() << std::endl; + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + printf("%i,%i,%i,%.6lf,%.6lf,%.6lf,%.6lf\n", GPUTradeoff / 1024, newhits,maxhits, (t2 - t1) / (double)ITERATIONS, (commtime) / (double)ITERATIONS,comptime / (double) ITERATIONS, checkingTime / (double) ITERATIONS); + fprintf(f, "%i,%i,%i,%.6lf,%.6lf,%.6lf\n", GPUTradeoff / 1024, newhits,maxhits, (t2 - t1) / (double)ITERATIONS, (commtime) / (double)ITERATIONS,comptime / (double) ITERATIONS); + fclose(f); + } + if(!COMMTESTON) break; + if(!newhits) break; + if(nprocs == 1) break; + } +#endif 
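// ---------------------------------------------------------------------------
// Illustrative sketch (hedged): the timing pattern used repeatedly above,
// factored into a helper. The helper name AverageSeconds and its callable
// parameter are inventions of this sketch; the barrier / MPI_Wtime / average
// structure mirrors the ITERATIONS loops in this file.
template <typename MultFn>
static double AverageSeconds(MultFn &&mult, int iterations)
{
    MPI_Barrier(MPI_COMM_WORLD);               // start all ranks together
    double t1 = MPI_Wtime();                   // wall-clock start
    for (int i = 0; i < iterations; ++i)
        mult();                                // one distributed SpGEMM per iteration
    MPI_Barrier(MPI_COMM_WORLD);               // wait for the slowest rank
    double t2 = MPI_Wtime();
    return (t2 - t1) / static_cast<double>(iterations);
}
// Usage (sketch only; template arguments as assumed in the sketch above):
//   double avg = AverageSeconds([&] {
//       C = Mult_AnXBn_DoubleBuff_CUDA<PTDOUBLEDOUBLE, double,
//                                      PSpMat<double>::DCCols>(A, B);
//   }, ITERATIONS);
// ---------------------------------------------------------------------------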
+ } + MPI_Finalize(); + return 0; +} + +#endif \ No newline at end of file diff --git a/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err b/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err index fd21442a..db062968 100644 --- a/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err +++ b/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err @@ -1,154 +1,154 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i153-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i153-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i131-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i102-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i134-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i126-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i106-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i136-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i163-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i101-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err b/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err index b40ad933..4f43ecb7 100644 --- a/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err +++ b/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i161-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
[Log-diff residue condensed: the remainder of this span consists of diffs of TACC Ranger job error logs in which every line is deleted and re-added with identical visible text (an apparent line-ending/whitespace-only normalization). Each hunk carries only the "ibrun ./MultTime <input1> <input2>" shell-trace line followed by repeated "Warning: Permanently added '<node>.ranger.tacc.utexas.edu' (RSA) to the list of known hosts." messages; no logged content changes. Besides the tail of the preceding hunk, the affected files are:
  ReleaseTests/NWAYSCALE22/btwcent256.1246095.err
  ReleaseTests/NWAYSCALE22/btwcent256.1246099.err
  ReleaseTests/NWAYSCALE22/btwcent256.1246103.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246094.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246098.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246102.err
  ReleaseTests/SCALE21RMATRMAT/btwcent100.1243955.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1024.1243969.err
  ReleaseTests/SCALE21RMATRMAT/btwcent121.1243956.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1225.1243970.err
  ReleaseTests/SCALE21RMATRMAT/btwcent144.1243957.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1600.1243972.err (hunk continues past this span)]
+Warning: Permanently added 'i154-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i112-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err b/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err index dc168e3b..86d6ed61 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err @@ -1,8 +1,8 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i107-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i119-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err b/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err index b719fcd3..9eab4595 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i116-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err b/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err index 72cfd40c..5435dd41 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err @@ -1,10 +1,10 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i134-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i134-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err b/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err index 3f25fddb..a8cdcbff 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err @@ -1,2 +1,2 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i114-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err b/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err index 8daee5b7..eceadf6a 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i149-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err b/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err index 943185a4..498c55e2 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err @@ -1,2 +1,2 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i175-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err b/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err index 729e4e0e..10499057 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err @@ -1,22 +1,22 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i154-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i118-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err b/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err index a6bf664a..8f7353ee 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i176-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err b/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err index 286a18d2..0abbd7de 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i103-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err b/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err index 859a5a41..ba156daa 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err @@ -1,6 +1,6 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i132-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i137-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err b/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err index 750a79cb..a9ec9e02 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err @@ -1,4 +1,4 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i146-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err index a2817758..8f65f85c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err @@ -1,33 +1,33 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i144-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i171-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i104-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err b/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err index e61c7640..0d2f0ad6 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err @@ -1,5 +1,5 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i143-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err index ffe42819..52a3260c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err @@ -1,20 +1,20 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i123-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i132-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err index e853b2ac..10f84598 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err @@ -1,29 +1,29 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i137-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i172-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err b/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err index 41626c39..4e5682d4 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i117-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err index 1373701b..9198e022 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err @@ -1,78 +1,78 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i135-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i102-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i116-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i175-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i101-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i133-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err b/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err index 8e9b9669..02a0f13d 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err @@ -1,8 +1,8 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i178-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i151-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err b/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err index 774a6453..2f1fc2fe 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err @@ -1,13 +1,13 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i163-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err b/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err index 7a11bb05..7989a7d9 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err @@ -1,10 +1,10 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i162-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i140-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err b/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err index 1a01c0ac..8216fe16 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i169-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err b/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err index 3aca29a5..7a4d8e91 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err @@ -1,17 +1,17 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i106-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i156-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err b/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err index 78e1e9fe..d60854d2 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i153-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err b/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err index 966d74a2..5cb3683c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err @@ -1,20 +1,20 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i130-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i159-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err b/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err index 7e554618..9e0fbc4a 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i157-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err b/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err index 9afcae33..49669163 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err @@ -1,4 +1,4 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i148-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err b/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err index 54ae5504..7e59aa24 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err @@ -1,33 +1,33 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i106-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i155-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i178-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err index 9756fcce..8e20c805 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err @@ -1,18 +1,18 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i104-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i114-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err index 9df94063..0225f425 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err @@ -1,34 +1,34 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i101-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i120-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i113-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err b/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err index 821f5473..813b8916 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err @@ -1,5 +1,5 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i136-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err index ed2e1bfc..432f9f70 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err @@ -1,50 +1,50 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i140-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i107-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i131-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i133-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i176-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err b/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err index 81294884..dc0f997e 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err @@ -1,7 +1,7 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i106-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err b/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err index 4bdcb9fb..c42de063 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err @@ -1,17 +1,17 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i171-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err b/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err index e200e72c..ec5cd52a 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err @@ -1,14 +1,14 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i162-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err b/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err index 9fa2573f..6df771ed 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i151-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i151-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err b/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err index 31325eb2..783078c1 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err @@ -1,22 +1,22 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i139-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err index 787cbd02..21398f2b 100644 --- a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err +++ b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE26-RMAT/rmat26.txt /work/00919/tg459476/SCALE26-RMAT/fringe_scale26_rect8192_sparse1000 -Warning: Permanently added 'i174-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i152-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i138-410:31041] *** Process received signal *** [i138-410:31041] Signal: Segmentation fault (11) [i138-410:31041] Signal code: Address not mapped (1) diff --git a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err index 9fdf8bb8..ca39c7f1 100644 --- a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err +++ b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err @@ -1,6 +1,6 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE26-RMAT/rmat26.txt /work/00919/tg459476/SCALE26-RMAT/fringe_scale26_rect8192_sparse100000 -Warning: Permanently added 'i122-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i162-410:02581] *** Process received signal *** [i162-410:02581] Signal: Bus error (7) [i162-410:02581] Signal code: (2) diff --git a/compile_commands.json b/compile_commands.json new file mode 120000 index 00000000..572ddf1d --- /dev/null +++ b/compile_commands.json @@ -0,0 +1 @@ +/jet/home/tmcfarla/CombBLAS/_build/compile_commands.json \ No newline at end of file diff --git a/include/CombBLAS/CombBLAS.h b/include/CombBLAS/CombBLAS.h index 94bbb6c2..fc93cf07 100644 --- a/include/CombBLAS/CombBLAS.h +++ b/include/CombBLAS/CombBLAS.h @@ -59,9 +59,9 @@ NOTICE. This Software was developed under funding from the U.S. 
Department of E // Just in case the -fopenmp didn't define _OPENMP by itself #ifdef THREADED - #ifndef _OPENMP - #define _OPENMP - #endif + //#ifndef _OPENMP + //#define _OPENMP + //#endif #endif #ifdef _OPENMP diff --git a/include/CombBLAS/ParFriends.h b/include/CombBLAS/ParFriends.h index a79c714c..279025eb 100644 --- a/include/CombBLAS/ParFriends.h +++ b/include/CombBLAS/ParFriends.h @@ -41,8 +41,22 @@ #include "OptBuf.h" #include "mtSpGEMM.h" #include "MultiwayMerge.h" +#include #include #include +#include + +#ifdef __CUDACC__ +#include +#include "cudaSpGEMM.h" +#include "../GALATIC/include/dCSR.cuh" +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/SemiRingInterface.h" +#include "../GALATIC/include/TestSpGEMM.cuh" +#include "../GALATIC/source/device/Multiply.cuh" +#endif +//#include "cudaSpGEMM.cu" + namespace combblas { @@ -1253,7 +1267,8 @@ SpParMat Mult_AnXBn_DoubleBuff int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication std::shared_ptr GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy); - LIA C_m = A.spSeq->getnrow(); + + LIA C_m = A.spSeq->getnrow(); LIB C_n = B.spSeq->getncol(); UDERA * A1seq = new UDERA(); @@ -1282,8 +1297,11 @@ SpParMat Mult_AnXBn_DoubleBuff int Aself = (A.commGrid)->GetRankInProcRow(); int Bself = (B.commGrid)->GetRankInProcCol(); + double mpi_overhead = 0.0; + for(int i = 0; i < stages; ++i) { + std::vector ess; if(i == Aself) { @@ -1350,6 +1368,7 @@ SpParMat Mult_AnXBn_DoubleBuff // Start the second round for(int i = 0; i < stages; ++i) { + std::vector ess; if(i == Aself) { @@ -1443,6 +1462,588 @@ SpParMat Mult_AnXBn_DoubleBuff return SpParMat (C, GridC); // return the result object } +#ifdef __CUDACC__ +template +struct Wrap_SR : SemiRing +{ + __host__ __device__ NT3 multiply(const NT1 &a, const NT2 &b) const { return sr::multiply(a, b); } + __host__ __device__ NT3 add(const NT1 &a, const NT2 &b) const { return sr::add(a, b); } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + + + +template +void convertCSR(UDERA *ARecv, dCSR &input_GPU, int id) +{ + typedef typename UDERA::LocalIT LIA; + LIA j = 0; + unsigned int *rows; + cudaMallocHost(&rows, sizeof(unsigned int) * (ARecv->getncol() + 1)); + HANDLE_ERROR(cudaGetLastError()); + + for (LIA i = 0; i <= ARecv->getnzc(); ++i) + { + if (i == ARecv->getnzc()) + { + while (j <= ARecv->getncol()) + { + rows[j] = ARecv->getnnz(); + j++; + } + break; + } + unsigned int val = (unsigned int) ARecv->GetDCSC()->cp[i]; + while (j <= ARecv->GetDCSC()->jc[i] && j <= ARecv->getncol()) + { + rows[j] = val; + j++; + } + } + HANDLE_ERROR(cudaGetLastError()); + + //std::cout << "STARTING ALLOCING in CONV " << id << std::endl; + if(input_GPU.nnz != 0) dealloc(input_GPU); + input_GPU.rows = ARecv->getncol(); + input_GPU.cols = ARecv->getnrow(); + input_GPU.nnz = ARecv->getnnz(); + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << input_GPU.nnz << std::endl; + gpuErrchk(cudaMalloc(&input_GPU.data, sizeof(NU1) * (ARecv->getnnz()))); + gpuErrchk(cudaMalloc(&input_GPU.col_ids, sizeof(unsigned int) * (ARecv->getnnz()))); + gpuErrchk(cudaMalloc(&input_GPU.row_offsets, sizeof(unsigned int) * (ARecv->getncol() + 1))); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "STARTING COPY " << id << std::endl; + + cudaMemcpy(input_GPU.row_offsets, rows, (input_GPU.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "CPED ROW/COLS " << id << std::endl; + 
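The first half of `convertCSR` above expands the DCSC column pointers (`cp`/`jc`, defined only for the `nzc` non-empty columns) into a dense CSR-style offset array with one entry per column plus one. A simplified host-side sketch of that expansion follows; the function name and container types are illustrative only and are not part of the patch.

```cpp
// Hedged sketch: expand DCSC column pointers into dense CSR offsets.
// Empty columns inherit the starting offset of the next non-empty column;
// trailing empty columns all point at nnz.
#include <cstdint>
#include <vector>

std::vector<unsigned int> expand_dcsc_offsets(const std::vector<int64_t>& cp,  // size nzc+1
                                              const std::vector<int64_t>& jc,  // size nzc
                                              int64_t ncol, int64_t nnz)
{
    std::vector<unsigned int> offsets(ncol + 1);
    int64_t j = 0;
    for (std::size_t i = 0; i < jc.size(); ++i)
    {
        // every column up to and including jc[i] starts where chunk i starts
        while (j <= jc[i] && j <= ncol)
            offsets[j++] = static_cast<unsigned int>(cp[i]);
    }
    while (j <= ncol)
        offsets[j++] = static_cast<unsigned int>(nnz);  // trailing empty columns
    return offsets;
}
```

The device-side matrix is then filled by copying this offset array together with the unchanged `ir`/`numx` arrays, which is what the surrounding `cudaMemcpy` calls do.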
if(ARecv->getnnz() > 0) gpuErrchk(cudaMemcpy(input_GPU.data, ARecv->GetDCSC()->numx, (ARecv->getnnz()) * sizeof(NU1), cudaMemcpyHostToDevice)); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "CPED NUM " << id << std::endl; + if(ARecv->getnnz() > 0) gpuErrchk(cudaMemcpy(input_GPU.col_ids, &(ARecv->GetDCSC()->ir[0]), (ARecv->getnnz()) * sizeof(unsigned int), cudaMemcpyHostToDevice)); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "DELETING ROWS " << id << std::endl; + + cudaFreeHost(rows); + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + + // free(rows); + +} + +// Workaround for now + + + + +struct MinPlusSRingGPU : SemiRing { + __host__ __device__ double multiply(const double& a, const double& b) const { if(a == std::numeric_limits::max() || b == std::numeric_limits::max()) { return std::numeric_limits::max();} else return a + b; } + __host__ __device__ double add(const double& a, const double& b) const { return std::min(a, b); } + __host__ __device__ static double AdditiveIdentity() { return std::numeric_limits::max(); } +}; + +typedef Arith_SR ringss; +Arith_SR sr; +double comptime = 0; +template +CSR GPULocalMultiply(dCSR& A, dCSR& B) +{ + + double t1 = MPI_Wtime(); + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 16; + const int MaxChunksGeneralizedMerge = 512; // MAX: 865 + const int MergePathOptions = 8; + HANDLE_ERROR(cudaGetLastError()); + + cudaDeviceSynchronize(); + SR semiring2; + if(A.nnz == 0 || B.nnz == 0) { + CSR C; + C.alloc(A.rows, B.rows, 0); + return C; + } + dCSR result_mat_GPU; + GPUMatrixMatrixMultiplyTraits DefaultTraits( + Threads, BlocksPerMP, NNZPerThread, InputElementsPerThreads, + RetainElementsPerThreads, MaxChunksToMerge, + MaxChunksGeneralizedMerge, MergePathOptions); + + const bool Debug_Mode = false; + // DefaultTraits.preferLoadBalancing = false; + ExecutionStats stats; + // stats.measure_all = false; + HANDLE_ERROR(cudaGetLastError()); + + + //std::cout << "ENTERED MULT" << std::endl; + ACSpGEMM::Multiply( + A, B, result_mat_GPU, + DefaultTraits, stats, Debug_Mode, sr); + //std::cout << "EXITED MULT" << std::endl; + + + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + // std::cout << "DONE" << std::endl; + CSR result_mat_CPU; + size_t it = 0; + // std::unordered_set nnzc_set; + // std::cout << result_mat_GPU.rows << std::endl; + convert(result_mat_CPU, result_mat_GPU); + //::cout << sizeof(NUO) * result_mat_GPU.nnz << std::endl; + //std::cout << sizeof(uint) * result_mat_GPU.rows << std::endl; + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + //cudaFree(result_mat_GPU.data); + //cudaFree(result_mat_GPU.col_ids); + //cudaFree(result_mat_GPU.row_offsets); + HANDLE_ERROR(cudaGetLastError()); + //result_mat_GPU.reset(); + cudaDeviceSynchronize(); + double t2 = MPI_Wtime(); + comptime += (t2 - t1); + HANDLE_ERROR(cudaGetLastError()); + return result_mat_CPU; +} + + +int GPUTradeoff = 1024 * 1024; +/** + * Parallel C = A*B routine that uses a double buffered broadcasting scheme, but + * this time with CUDA + * @pre { Input matrices, A and B, should not alias } + * Most memory efficient version available. 
Total stages: 2*sqrt(p) + * Memory requirement during first sqrt(p) stages: <= (3/2)*(nnz(A)+nnz(B))+(1/2)*nnz(C) + * Memory requirement during second sqrt(p) stages: <= nnz(A)+nnz(B)+nnz(C) + * Final memory requirement: nnz(C) if clearA and clearB are true + **/ +double checkingTime = 0; +template +SpParMat Mult_AnXBn_DoubleBuff_CUDA(SpParMat &A, + SpParMat &B, + bool clearA = false, + bool clearB = false) + +{ + HANDLE_ERROR(cudaGetLastError()); + + if (!CheckSpGEMMCompliance(A, B)) + { + return SpParMat(); + } + typedef typename UDERA::LocalIT LIA; + typedef typename UDERB::LocalIT LIB; + typedef typename UDERO::LocalIT LIC; + + double over = 0; + double t1 = MPI_Wtime(); + static_assert( + std::is_same::value, + "local index types for both input matrices should be the same"); + static_assert( + std::is_same::value, + "local index types for input and output matrices should be the same"); + + int stages, dummy; // last two parameters of ProductGrid are ignored for + // Synch multiplication + int id; + MPI_Comm_rank(MPI_COMM_WORLD, &id); + ACSpGEMM::id = id; + int devices; + HANDLE_ERROR(cudaGetLastError()); + + cudaGetDeviceCount(&devices); + int local_rank, local_size; + //MPI_Comm local_comm; + //MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, id, MPI_INFO_NULL, &local_comm); + //MPI_Comm_size(local_comm, &local_size); + //MPI_Comm_rank(local_comm, &local_rank); + cudaSetDevice(id % devices); + int devs; + + //cudaGetDeviceCount(&devs); + //cudaSetDevice(id % devs); // Prevents crashes where processes > # devs + std::shared_ptr GridC = ProductGrid( + (A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy); + LIA C_m = A.spSeq->getnrow(); + LIB C_n = B.spSeq->getncol(); + + UDERA *A1seq = new UDERA(); + UDERA *A2seq = new UDERA(); + UDERB *B1seq = new UDERA(); + UDERB *B2seq = new UDERB(); + int Aself = (A.commGrid)->GetRankInProcRow(); + int Bself = (B.commGrid)->GetRankInProcCol(); + + checkingTime += MPI_Wtime() - t1; + + (A.spSeq)->Split(*A1seq, *A2seq); + const_cast(B.spSeq)->Transpose(); + (B.spSeq)->Split(*B1seq, *B2seq); + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << Aself << " " << Bself << " starting GPU" << std::endl; + dCSR input_A_GPU; + dCSR input_B_GPU; + Wrap_SR semiring; + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << Aself << " " << Bself << " ending cpus" << std::endl; + + gpuErrchk(cudaDeviceSynchronize()); + + // Transpose back for the column-by-column algorithm + const_cast(B1seq)->Transpose(); + const_cast(B2seq)->Transpose(); + LIA **ARecvSizes = SpHelper::allocate2D(UDERA::esscount, stages); + LIB **BRecvSizes = SpHelper::allocate2D(UDERB::esscount, stages); + + SpParHelper::GetSetSizes(*A1seq, ARecvSizes, (A.commGrid)->GetRowWorld()); + SpParHelper::GetSetSizes(*B1seq, BRecvSizes, (B.commGrid)->GetColWorld()); + + // Remotely fetched matrices are stored as pointers + UDERA *ARecv; + UDERB *BRecv; + std::vector *> tomerge; + HANDLE_ERROR(cudaGetLastError()); + + + HANDLE_ERROR(cudaGetLastError()); + + over += MPI_Wtime() - t1; + + double mpi_overhead = 0.0; + + for (int i = 0; i < stages; ++i) + { + HANDLE_ERROR(cudaGetLastError()); + double t2 = MPI_Wtime(); + dCSR input_A_recv_GPU; + dCSR input_B_recv_GPU; + std::vector ess; + if (i == Aself) + { + convertCSR(A1seq, input_A_recv_GPU, id); + + } + else + { + + ARecv = new UDERA(); // first, create the object + } + ess.resize(UDERA::esscount); + for (int j = 0; j < UDERA::esscount; ++j) + { + ess[j] = ARecvSizes[j][i]; // essentials of the ith + // matrix in this row + } + 
//std::cout << "STARTING BCAST " << id << std::endl; + SpParHelper::BCastMatrixCUDA(GridC->GetRowWorld(), + input_A_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + //std::cout << "ENDING BCAST " << id << std::endl; + ess.clear(); + if (i == Bself) + { + convertCSR(B1seq, input_B_recv_GPU, id); // shallow-copy + } + else + { + + BRecv = new UDERB(); + } + ess.resize(UDERB::esscount); + for (int j = 0; j < UDERB::esscount; ++j) + { + ess[j] = BRecvSizes[j][i]; + } + SpParHelper::BCastMatrixCUDA(GridC->GetColWorld(), + input_B_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + HANDLE_ERROR(cudaGetLastError()); + //std::cout << "first bcast done for " << id << std::endl; + + //if(input_B_recv_GPU.nnz == 0 || input_A_recv_GPU.nnz == 0) { + // std::cout << "ZEROOO " << id << std::endl; + // continue; + //} + // before activating this remove transposing B1seq + /* + SpTuples * C_cont = MultiplyReturnTuples + (*ARecv, *BRecv, // parameters + themselves false, true, // transpose information (B is + transposed) i != Aself, // 'delete A' condition i != + Bself); // 'delete B' condition + + */ + + // load results onto CPU. + + // double start = MPI_Wtime(); + + // std::cout << Aself << " " << Bself << " ending alloc" << + // std::endl; double start = MPI_Wtime(); double t1 = + // MPI_Wtime(); + + // std::cout << input_A_recv_GPU.rows << std::endl; + mpi_overhead += MPI_Wtime() - t2; + //std::cout << "mult on " << id << std::endl; + //MPI_Barrier(MPI_COMM_WORLD); + //MPI_Barrier(MPI_COMM_WORLD); + CSR result_mat_CPU = GPULocalMultiply(input_B_recv_GPU, input_A_recv_GPU); + + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + //std::cout << "mult off" << id << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + //over += MPI_Wtime() - t1; + //std::cout << "TUPLING " << id << std::endl; + // printf("O = %i\n", C_cont->getnnz()); + // mpi_overhead += MPI_Wtime() - start; + size_t it = 0; + std::tuple *tuplesC = + static_cast *>(::operator new( + sizeof(std::tuple[result_mat_CPU.nnz]))); + for (LIC i = 0; i < result_mat_CPU.rows; ++i) + { + + for (LIC j = result_mat_CPU.row_offsets[i]; + j < result_mat_CPU.row_offsets[i + 1]; ++j) + { + // nzc_set.insert(result_mat_CPU.col_ids[j]); + // std::cout << "IT " << it << " EXCEEDED " << + // result_mat_CPU.nnz < *C_cont = new SpTuples( + result_mat_CPU.nnz, C_m, + C_n, tuplesC, false, true); + //(*C_cont).PrintInfo(); + if (i != Aself) + delete ARecv; + //dealloc(input_A_recv_GPU); + + if (i != Bself) + delete BRecv; + //dealloc(input_B_recv_GPU); + + if (!C_cont->isZero()) + tomerge.push_back(C_cont); + else + delete C_cont; + } + HANDLE_ERROR(cudaGetLastError()); + + if (clearA) + delete A1seq; + if (clearB) + delete B1seq; + + // Set the new dimensions + t1 = MPI_Wtime(); + //dealloc(input_A_GPU); + //dealloc(input_B_GPU); + cudaDeviceSynchronize(); + dCSR input_A2_GPU; + dCSR input_B2_GPU; + HANDLE_ERROR(cudaGetLastError()); + + + HANDLE_ERROR(cudaGetLastError()); + + SpParHelper::GetSetSizes(*A2seq, ARecvSizes, (A.commGrid)->GetRowWorld()); + SpParHelper::GetSetSizes(*B2seq, BRecvSizes, (B.commGrid)->GetColWorld()); + over += MPI_Wtime() - t1; + + + //std::cout << "S3 " << id << std::endl; + for (int i = 0; i < stages; ++i) + { + double t2 = MPI_Wtime(); + dCSR input_A_recv_GPU; + dCSR input_B_recv_GPU; + // std::cout << Aself << " " << Bself << " starting stage " << i + // << std::endl; + std::vector ess; + if (i == Aself) + { + convertCSR(A2seq, input_A_recv_GPU, id); + } else + { + + ARecv = new UDERA(); // first, create 
the object + } + ess.resize(UDERA::esscount); + for (int j = 0; j < UDERA::esscount; ++j) + { + ess[j] = ARecvSizes[j][i]; // essentials of the ith + // matrix in this row + } + //std::cout << "STARTING BCAST " << id << std::endl; + SpParHelper::BCastMatrixCUDA(GridC->GetRowWorld(), + input_A_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + //std::cout << "ENDING BCAST " << id << std::endl; + ess.clear(); + if (i == Bself) + { + convertCSR(B2seq, input_B_recv_GPU, id); } + else + { + + BRecv = new UDERB(); + } + ess.resize(UDERB::esscount); + for (int j = 0; j < UDERB::esscount; ++j) + { + ess[j] = BRecvSizes[j][i]; + } + SpParHelper::BCastMatrixCUDA(GridC->GetColWorld(), + input_B_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + + // before activating this remove transposing B1seq + /* + SpTuples * C_cont = MultiplyReturnTuples + (*ARecv, *BRecv, // parameters + themselves false, true, // transpose information (B is + transposed) i != Aself, // 'delete A' condition i != + Bself); // 'delete B' condition + + */ + /* ARecv->Transpose(); + BRecv->Transpose(); + SpTuples * C_cont = LocalHybridSpGEMM + (*ARecv, *BRecv, // parameters themselves + i != Aself, // 'delete A' condition + i != Bself); // 'delete B' condition*/ + // const_cast< UDERB* >(B.spSeq)->Transpose(); + HANDLE_ERROR(cudaGetLastError()); + + mpi_overhead += MPI_Wtime() - t2; + CSR result_mat_CPU = GPULocalMultiply(input_B_recv_GPU, input_A_recv_GPU); + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + + //over += MPI_Wtime() - t1; + //std::cout << over << std::endl; + // std::cout << "ENDING MULT" << std::endl; + // mpi_overhead += MPI_Wtime() - start; + // double t2 = MPI_Wtime(); + // printf("Time for actual mult = %.6lf \n", t2 - t1); + size_t it = 0; + // std::unordered_set nnzc_set; + // std::cout << result_mat_GPU.nnz << std::endl; + // std::cout << Aself << " " << Bself << " ending GPU " << i << + // std::endl; + // printf("OC = %i\n", result_mat_CPU.nnz); + + std::tuple *tuplesC = + static_cast *>(::operator new( + sizeof(std::tuple[result_mat_CPU.nnz]))); + for (LIC i = 0; i < result_mat_CPU.rows; ++i) + { + for (LIC j = result_mat_CPU.row_offsets[i]; + j < result_mat_CPU.row_offsets[i + 1]; ++j) + { + // nzc_set.insert(result_mat_CPU.col_ids[j]); + // std::cout << "IT " << it << " EXCEEDED " << + // result_mat_CPU.nnz < *C_cont = new SpTuples( + result_mat_CPU.nnz, C_m, + C_n, tuplesC, false, true); + //(*C_cont).PrintInfo(); + if (i != Aself) + delete ARecv; + //dealloc(input_A_recv_GPU); + + + if (i != Bself) + delete BRecv; + //dealloc(input_B_recv_GPU); + + if (!C_cont->isZero()) + tomerge.push_back(C_cont); + else + delete C_cont; + } + t1 = MPI_Wtime(); + //dealloc(input_A2_GPU); + //dealloc(input_B2_GPU); + SpHelper::deallocate2D(ARecvSizes, UDERA::esscount); + SpHelper::deallocate2D(BRecvSizes, UDERB::esscount); + // A2seq->Transpose(); + // B2seq->Transpose(); + if (clearA) + { + delete A2seq; + delete A.spSeq; + A.spSeq = NULL; + } + else + { + // A1seq->Transpose(); + // A2seq->Transpose(); + (A.spSeq)->Merge(*A1seq, *A2seq); + delete A1seq; + delete A2seq; + } + if (clearB) + { + delete B2seq; + delete B.spSeq; + B.spSeq = NULL; + } + else + { + B1seq->Transpose(); + B2seq->Transpose(); + (B.spSeq)->Merge(*B1seq, *B2seq); + delete B1seq; + delete B2seq; + const_cast(B.spSeq) + ->Transpose(); // transpose back to original + } + //checkingTime += MPI_Wtime() - t1; + // printf("%.6lf\n", mpi_overhead); + UDERO *C = new UDERO(MergeAll(tomerge, C_m, C_n, 
true), false); + // printf("Full output has rows = %i, cols = %i, nnz = %i\n", C->getnrow(), + // C->getncol(), C->getnnz()); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + + over += MPI_Wtime() - t1; + //std::cout << over << "\n"; + return SpParMat( + C, GridC); // return the result object // return the result object + HANDLE_ERROR(cudaGetLastError()); + +} + +#endif + /** * Parallel A = B*C routine that uses only MPI-1 features * Relies on simple blocking broadcast diff --git a/include/CombBLAS/SequenceHeaps/util.h b/include/CombBLAS/SequenceHeaps/util.h index 1d4b41aa..e9c146c7 100644 --- a/include/CombBLAS/SequenceHeaps/util.h +++ b/include/CombBLAS/SequenceHeaps/util.h @@ -68,11 +68,11 @@ ////////////// min, max etc. ////////////////////////////////////// #ifndef Max -#define Max(x,y) ((x)>=(y)?(x):(y)) +//#define Max(x,y) ((x)>=(y)?(x):(y)) #endif #ifndef Min -#define Min(x,y) ((x)<=(y)?(x):(y)) +//#define Min(x,y) ((x)<=(y)?(x):(y)) #endif #ifndef Abs diff --git a/include/CombBLAS/SpDefs.h b/include/CombBLAS/SpDefs.h index 23c3941f..050d9666 100644 --- a/include/CombBLAS/SpDefs.h +++ b/include/CombBLAS/SpDefs.h @@ -118,8 +118,8 @@ Row // force 8-bytes alignment in heap allocated memory -#ifndef ALIGN -#define ALIGN 8 +#ifndef ALIGNX +#define ALIGNX 8 #endif #ifndef THRESHOLD diff --git a/include/CombBLAS/SpImpl.h b/include/CombBLAS/SpImpl.h index bd0bda94..0c1fe909 100644 --- a/include/CombBLAS/SpImpl.h +++ b/include/CombBLAS/SpImpl.h @@ -201,4 +201,4 @@ struct SpImpl // specialization #include "SpImpl.cpp" -#endif \ No newline at end of file +#endif diff --git a/include/CombBLAS/SpParHelper.cpp b/include/CombBLAS/SpParHelper.cpp index f2198294..61ca0cbd 100644 --- a/include/CombBLAS/SpParHelper.cpp +++ b/include/CombBLAS/SpParHelper.cpp @@ -600,6 +600,101 @@ void SpParHelper::BCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, cons } } +/** + * @param[in] Matrix {For the root processor, the local object to be sent to all others. 
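For reference, the new `Mult_AnXBn_DoubleBuff_CUDA` is meant to be called the same way as the existing `Mult_AnXBn_DoubleBuff`. The sketch below is an assumption rather than code from this patch: the explicit template arguments (semiring, output value type, output local storage type), the `PlusTimesSRing` choice, and the input-reading boilerplate mirror the usual CombBLAS release-test pattern.

```cpp
// Hypothetical driver for the GPU double-buffered SUMMA path; file names and
// the template-argument order are assumptions based on the existing
// Mult_AnXBn_DoubleBuff call convention, not taken from this patch.
#include <memory>
#include <mpi.h>
#include "CombBLAS/CombBLAS.h"
using namespace combblas;

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    {
        typedef SpDCCols<int64_t, double> DER;
        typedef PlusTimesSRing<double, double> PTDD;

        std::shared_ptr<CommGrid> fullWorld;
        fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0));

        SpParMat<int64_t, double, DER> A(fullWorld), B(fullWorld);
        A.ParallelReadMM(argv[1], true, maximum<double>());  // 1-based Matrix Market input
        B.ParallelReadMM(argv[2], true, maximum<double>());

        // Semantics are intended to match Mult_AnXBn_DoubleBuff
        SpParMat<int64_t, double, DER> C =
            Mult_AnXBn_DoubleBuff_CUDA<PTDD, double, DER>(A, B);
        C.PrintInfo();
    }
    MPI_Finalize();
    return 0;
}
```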
+ * For all others, it is a (yet) empty object to be filled by the received data} + * @param[in] essentials {irrelevant for the root} + **/ + +#ifdef __CUDACC__ + +double commtime = 0; +int comms = 0; +int datahits = 0; + +int rowshits = 0; + +int colhits = 0; + + +template +void SpParHelper::BCastMatrixCUDA(MPI_Comm & comm1d, dCSR & Matrix, const std::vector & essentials, int root, int GPUTradeoff) +{ + comms += 1; + double t1 = MPI_Wtime(); + cudaDeviceSynchronize(); + int myrank; + MPI_Comm_rank(comm1d, &myrank); + if(myrank != root) + { + Matrix.alloc(essentials[2],essentials[1],essentials[0],true); + } + + //if(sizeof(uint)*(Matrix.nnz) <= 32000) std::cout << "UNDER COLS" << std::endl; + //if(sizeof(NT)*(Matrix.nnz) <= 32000) std::cout << "UNDER DATA" << std::endl; + //std::cout << myrank << " " << Matrix.rows << " " << Matrix.cols << " " << Matrix.nnz << std::endl; + cudaDeviceSynchronize(); + //std::cout << myrank << " BCASTING FIRST FROM " << root << std::endl; + //if(!essentials[0]) return; + //size_t free; + //size_t total; + //cudaMemGetInfo(&free, &total); + //std::cout << myrank << " has " << free << " of " << total << std::endl; + //int GPUTradeoff = 1024 * 1024; + //std::cout << GPUTradeoff << std::endl; + if(sizeof(uint)*(Matrix.rows + 1) >= GPUTradeoff) { + rowshits += 1; + MPI_Bcast(Matrix.row_offsets, Matrix.rows + 1, MPIType(), root, comm1d); + } else { + uint* temp = (uint*) malloc(sizeof(uint)*(Matrix.rows + 1)); + if(myrank == root) cudaMemcpy(temp, Matrix.row_offsets, (Matrix.rows + 1)*sizeof(uint), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.rows + 1, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.row_offsets, temp, (Matrix.rows + 1)*sizeof(uint), cudaMemcpyHostToDevice); + free(temp); + } + + cudaDeviceSynchronize(); + //std::cout << myrank << " BCASTING SECOND" << std::endl; + if(sizeof(uint)*(Matrix.nnz) >= GPUTradeoff) { + colhits += 1; + MPI_Bcast(Matrix.col_ids, Matrix.nnz, MPIType(), root, comm1d); + } else { + //std::cout << "ACTIVATED WOOHOO" << std::endl; + uint* temp = (uint*) malloc(sizeof(uint)*Matrix.nnz); + if(myrank == root) cudaMemcpy(temp, Matrix.col_ids, Matrix.nnz*sizeof(uint), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.nnz, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.col_ids, temp, Matrix.nnz*sizeof(uint), cudaMemcpyHostToDevice); + free(temp); + //MPI_Bcast(Matrix.col_ids, Matrix.nnz, MPIType(), root, comm1d); + } + + cudaDeviceSynchronize(); + //std::cout << "BCASTING 2 " << myrank << std::endl; + if(sizeof(NT)*(Matrix.nnz) >= GPUTradeoff) { + datahits += 1; + MPI_Bcast(Matrix.data, Matrix.nnz, MPIType(), root, comm1d); + } else { + //std::cout << "WE ARE ON" << std::endl; + NT* temp = (NT*) malloc(sizeof(NT)*Matrix.nnz); + if(myrank == root) cudaMemcpy(temp, Matrix.data, Matrix.nnz*sizeof(NT), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.nnz, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.data, temp, Matrix.nnz*sizeof(NT), cudaMemcpyHostToDevice); + free(temp); + } + + cudaDeviceSynchronize(); + //std::cout << "BCAST DONE " << myrank << std::endl; + commtime += MPI_Wtime() - t1; +} + +#endif + /** * @param[in] Matrix {For the root processor, the local object to be sent to all others. 
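`BCastMatrixCUDA` above broadcasts each of the three CSR arrays either directly from device memory (which relies on a CUDA-aware MPI) or, when the array is smaller than the `GPUTradeoff` byte threshold, through a temporary host buffer. A minimal sketch of that per-array decision, using an illustrative function name and a plain `double` payload rather than the templated types used in the patch:

```cpp
// Hedged sketch of the size-based broadcast path selection.  Assumes the MPI
// library accepts device pointers (CUDA-aware MPI); small arrays are staged
// through the host instead.
#include <cstdlib>
#include <cuda_runtime.h>
#include <mpi.h>

void bcast_device_array(double* d_buf, int count, int root, MPI_Comm comm,
                        std::size_t gpu_direct_threshold = 1024 * 1024)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    if (count * sizeof(double) >= gpu_direct_threshold)
    {
        // Large array: hand the device pointer straight to MPI.
        MPI_Bcast(d_buf, count, MPI_DOUBLE, root, comm);
    }
    else
    {
        // Small array: stage through a host copy to avoid GPU-direct overhead.
        double* h_buf = static_cast<double*>(std::malloc(count * sizeof(double)));
        if (rank == root)
            cudaMemcpy(h_buf, d_buf, count * sizeof(double), cudaMemcpyDeviceToHost);
        MPI_Bcast(h_buf, count, MPI_DOUBLE, root, comm);
        if (rank != root)
            cudaMemcpy(d_buf, h_buf, count * sizeof(double), cudaMemcpyHostToDevice);
        std::free(h_buf);
    }
}
```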
* For all others, it is a (yet) empty object to be filled by the received data} diff --git a/include/CombBLAS/SpParHelper.h b/include/CombBLAS/SpParHelper.h index 840a09f4..e43cd30b 100644 --- a/include/CombBLAS/SpParHelper.h +++ b/include/CombBLAS/SpParHelper.h @@ -42,6 +42,7 @@ #include "MPIType.h" #include "SpDefs.h" #include "psort/psort.h" +#include "../GALATIC/include/dCSR.cuh" namespace combblas { @@ -80,6 +81,11 @@ class SpParHelper template static void BCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, const std::vector & essentials, int root); +#ifdef __CUDACC__ + template + static void BCastMatrixCUDA(MPI_Comm & comm1d, dCSR & Matrix, const std::vector & essentials, int root, int GPUTradeoff=1024*1024); +#endif + template static void IBCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, const std::vector & essentials, int root, std::vector & indarrayReq , std::vector & numarrayReq); diff --git a/include/CombBLAS/SpParMat.cpp b/include/CombBLAS/SpParMat.cpp index 70b213a4..bd418921 100644 --- a/include/CombBLAS/SpParMat.cpp +++ b/include/CombBLAS/SpParMat.cpp @@ -92,7 +92,7 @@ SpParMat< IT,NT,DER >::SpParMat () assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) ); spSeq = new DER(); commGrid.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); -} +} /** * If there is a single file read by the master process only, use this and then call ReadDistribute() diff --git a/include/CombBLAS/SpParMat.h b/include/CombBLAS/SpParMat.h index 1d0880b8..62cf179d 100644 --- a/include/CombBLAS/SpParMat.h +++ b/include/CombBLAS/SpParMat.h @@ -323,6 +323,10 @@ class SpParMat template friend SpParMat Mult_AnXBn_DoubleBuff (SpParMat & A, SpParMat & B, bool clearA, bool clearB); + + template + friend SpParMat + Mult_AnXBn_DoubleBuff_CUDA (SpParMat & A, SpParMat & B, bool clearA, bool clearB); template friend SpParMat diff --git a/include/CombBLAS/cudaSpGEMM.cu b/include/CombBLAS/cudaSpGEMM.cu new file mode 100644 index 00000000..267651da --- /dev/null +++ b/include/CombBLAS/cudaSpGEMM.cu @@ -0,0 +1,147 @@ + + +#include "cudaSpGEMM.h" +#include +#include +#include +#include +#include +#include +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/dCSR.cuh" + +//#include "../GALATIC/source/device/Multiply.cuh" + +template +__global__ void transformColumn_d(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC, IT* curptrC, IT B_nzc) { + for(size_t i = blockIdx.x; i < B_nzc; i += gridDim.x) { + size_t nnzcolB = B_CP[i+1] - B_CP[i]; + //if(j == 0) printf("BlockDim = %i, GridDim = %i", blockDim.x, gridDim.x); + for(size_t j = threadIdx.x; j < A_nzc; j += blockDim.x) { + bool made = false; + size_t r = A_Tran_CP[j]; + uint ptr = curptrC[i]; + for (size_t k = 0; k < nnzcolB; ++k) { + + while (r < A_Tran_CP[j + 1] && B_IR[B_CP[i]+k] > A_Tran_IR[r]) { + r++; + } + if (r >= A_Tran_CP[j + 1]) { + break; + } + if (B_IR[B_CP[i]+k] == A_Tran_IR[r]) { + NTO mrhs = A_Tran_numx[r] * B_numx[B_CP[i]+k]; + if(true) { + if (made) { + std::get<2>(tuplesC[ptr]) = std::get<2>(tuplesC[ptr]) + mrhs; + } else { + made = true; + ptr = atomicAdd((unsigned long long*) &curptrC[i],(unsigned long long) 1); + //if (colptr_size_d[i] != ptr - curptrC[i]) printf("Potential conflict\n"); + //__syncthreads(); + //printf("Adding at ptr = %i\n", (int) ptr); + // colptr_size_d[i]++; + std::get<0>(tuplesC[ptr]) = A_Tran_JC[j]; + //if (A_Tran_JC[j] < 0 || B_JC[i] < 0) { + // printf("Somehow got a <0, %i, %i", (int) A_Tran_JC[j], (int) B_JC[i]); + 
//} + std::get<1>(tuplesC[ptr])= B_JC[i]; + std::get<2>(tuplesC[ptr]) = mrhs; + } + } + } + } + } + } +} +template < typename NTO, typename IT, typename NT1, typename NT2> +void transformColumn(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC_d, IT* curptrC, IT B_nzc) { + int blks = std::min(65535,(int) B_nzc); + transformColumn_d<<>>(A_nzc, A_Tran_CP, + A_Tran_IR, + A_Tran_JC, + A_Tran_numx, + B_CP, +B_IR, + B_JC, + B_numx, + tuplesC_d, curptrC, B_nzc); +} + +template void transformColumn< double, int64_t, double, double>( + int64_t A_nzc, int64_t* A_Tran_CP, + int64_t* A_Tran_IR, + int64_t* A_Tran_JC, + double* A_Tran_numx, + int64_t* B_CP, + int64_t* B_IR, + int64_t* B_JC, + double* B_numx, + std::tuple * tuplesC_d, int64_t* curptrC, int64_t B_nzc); + +template +__host__ CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, IT * aux = nullptr) { + /* dCSR input_A_GPU; +dCSR input_B_GPU; + +dCSR result_mat_GPU; +convert(input_A_GPU, input_A_CPU); +convert(input_B_GPU, input_B_CPU); + +// load data into semiring struct. For this one, we don't need to do anything, +// but you still need to pass it in for generality. The cost is trivial. + + +// Setup execution options, we'll skip the details for now. + +const int Threads = 256; +const int BlocksPerMP = 1; +const int NNZPerThread = 2; +const int InputElementsPerThreads = 2; +const int RetainElementsPerThreads = 1; +const int MaxChunksToMerge = 16; +const int MaxChunksGeneralizedMerge = 256; // MAX: 865 +const int MergePathOptions = 8; + + +GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); + +const bool Debug_Mode = true; +DefaultTraits.preferLoadBalancing = true; +ExecutionStats stats; +stats.measure_all = false; + +// Actually perform the matrix multiplicaiton +//ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + +CSR result_mat_CPU; +// load results onto CPU. 
+convert(result_mat_CPU, result_mat_GPU); +return result_mat_CPU;*/ + } + +template CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, int64_t * aux = nullptr); diff --git a/include/CombBLAS/cudaSpGEMM.h b/include/CombBLAS/cudaSpGEMM.h new file mode 100644 index 00000000..af2fa164 --- /dev/null +++ b/include/CombBLAS/cudaSpGEMM.h @@ -0,0 +1,31 @@ +#ifndef _cudaSpGEMM_h +#define _cudaSpGEMM_h + +#include "../GALATIC/include/CSR.h" +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/SemiRingInterface.h" +#include + +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) const { return a * b; } + __host__ __device__ double add(const double& a, const double& b) const { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + template < typename NTO, typename IT, typename NT1, typename NT2> +void transformColumn(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC_d, IT* curptrC, IT B_nzc); + +template +CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, IT * aux = nullptr); +#endif diff --git a/include/CombBLAS/mtSpGEMM.h b/include/CombBLAS/mtSpGEMM.h index 356f3b16..a708b138 100644 --- a/include/CombBLAS/mtSpGEMM.h +++ b/include/CombBLAS/mtSpGEMM.h @@ -2,7 +2,33 @@ #define _mtSpGEMM_h #include "CombBLAS.h" +#include +#ifdef GPU_ENABLED +#include +#include +#include "cudaSpGEMM.h" +#endif + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#define CHECK_CUSPARSE(func) \ +{ \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ + __LINE__, cusparseGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ +} namespace combblas { /* Multithreaded prefix sum @@ -459,6 +485,208 @@ SpTuples * LocalHybridSpGEMM return spTuplesC; } +#ifdef GPU_ENABLED + + + + template + SpTuples* LocalHybridSpGEMM_CUDA +(const SpDCCols & A, + const SpDCCols & B, + bool clearA, bool clearB, IT * aux = nullptr) +{ + + + IT mdim = A.getnrow(); + IT ndim = B.getncol(); + IT nnzA = A.getnnz(); + if(A.isZero() || B.isZero()) + { + return new SpTuples(0, mdim, ndim); + } + + + Dcsc* Adcsc = A.GetDCSC(); + Dcsc* Bdcsc = B.GetDCSC(); + IT nA = A.getncol(); + float cf = static_cast(nA+1) / static_cast(Adcsc->nzc); + IT csize = static_cast(ceil(cf)); // chunk size + bool deleteAux = false; + if(aux==nullptr) + { + deleteAux = true; + Adcsc->ConstructAux(nA, aux); + } + + int numThreads = 1; +#ifdef THREADED +#pragma omp parallel + { + numThreads = omp_get_num_threads(); + } +#endif + + IT* flopC = estimateFLOP(A, B, aux); + + + IT* colnnzC = estimateNNZ_Hash(A, B, flopC, aux); + IT* flopptr = prefixsum(flopC, Bdcsc->nzc, numThreads); + IT flop = flopptr[Bdcsc->nzc]; + IT* colptrC = prefixsum(colnnzC, Bdcsc->nzc, numThreads); + delete [] colnnzC; + delete [] flopC; + IT nnzc = colptrC[Bdcsc->nzc]; + + + std::tuple * tuplesC = static_cast *> (::operator new (sizeof(std::tuple[nnzc]))); + + std::vector>> colindsVec(numThreads); + + std::vector>> globalHashVecAll(numThreads); + 
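The `transformColumn_d` kernel added in `cudaSpGEMM.cu` computes C with an inner-product formulation: for every nonzero column of B and every nonzero column of A^T (that is, nonzero row of A) it walks the two sorted row-index lists with a two-pointer merge and accumulates the products, matching the commented-out CPU loop kept further down in `LocalHybridSpGEMM_CUDA`. A single-threaded host reference of the same idea, with illustrative names and types that are not part of the patch:

```cpp
// Hedged CPU reference for the per-column merge join done on the GPU above.
#include <cstdint>
#include <tuple>
#include <vector>

std::vector<std::tuple<int64_t, int64_t, double>> inner_product_spgemm(
    const std::vector<int64_t>& At_cp, const std::vector<int64_t>& At_ir,
    const std::vector<int64_t>& At_jc, const std::vector<double>& At_val,
    const std::vector<int64_t>& B_cp,  const std::vector<int64_t>& B_ir,
    const std::vector<int64_t>& B_jc,  const std::vector<double>& B_val)
{
    std::vector<std::tuple<int64_t, int64_t, double>> triples;
    for (std::size_t i = 0; i + 1 < B_cp.size(); ++i)        // nonzero columns of B
    {
        for (std::size_t j = 0; j + 1 < At_cp.size(); ++j)   // nonzero columns of A^T
        {
            int64_t r = At_cp[j];
            bool made = false;
            double acc = 0.0;
            for (int64_t k = B_cp[i]; k < B_cp[i + 1]; ++k)  // walk B's column
            {
                while (r < At_cp[j + 1] && At_ir[r] < B_ir[k]) ++r;   // advance A^T pointer
                if (r >= At_cp[j + 1]) break;
                if (At_ir[r] == B_ir[k]) { acc += At_val[r] * B_val[k]; made = true; }
            }
            if (made) triples.emplace_back(At_jc[j], B_jc[i], acc);   // (row of A, col of B, value)
        }
    }
    return triples;
}
```

The GPU version differs mainly in that columns of B are assigned to blocks, columns of A^T to threads, and output slots are claimed with `atomicAdd` on the per-column write pointer.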
std::vector>> globalHeapVecAll(numThreads); + + + + SpDCCols A_Tran = A.TransposeConst(); + SpDCCols B_Tran = B.TransposeConst(); + + Dcsc* Adcsc_Tran = A_Tran.GetDCSC(); + IT* A_Tran_CP; + IT* A_Tran_IR; + IT* A_Tran_JC; + NT1* A_Tran_numx; + IT* B_CP; + IT* B_IR; + IT* B_JC; + NT2* B_numx; + std::tuple * tuplesC_d; + IT * tuplesC_d_o; + IT * tuplesC_d_t; + NTO * tuplesC_d_th; + uint * colptr_size_d; + uint* curptr_d; + IT * colptrC_d; + cudaMalloc((void**) &curptr_d, sizeof(uint)); + cudaMalloc((void**) &tuplesC_d_o, (sizeof(IT[nnzc]))); + cudaMalloc((void**) &tuplesC_d_t, (sizeof(IT[nnzc]))); + cudaMalloc((void**) &tuplesC_d_th, (sizeof(NTO[nnzc]))); + cudaMalloc((void**) &tuplesC_d, (sizeof(std::tuple[nnzc]))); + cudaMalloc((void**) &colptr_size_d, (sizeof(uint[Bdcsc->nzc]))); + cudaMemset(colptr_size_d, 0, sizeof(uint[Bdcsc->nzc])); + cudaMalloc((void**) &A_Tran_CP, sizeof(IT[Adcsc_Tran->nzc + 1])); + cudaMalloc((void**) &A_Tran_IR, sizeof(IT[Adcsc_Tran->nz])); + cudaMalloc((void**) &A_Tran_JC, sizeof(IT[Adcsc_Tran->nzc])); + cudaMalloc((void**) &A_Tran_numx, sizeof(NT1[Adcsc_Tran->nz])); + cudaMalloc((void**) &B_CP, sizeof(IT[Bdcsc->nzc + 1])); + cudaMalloc((void**) &B_IR, sizeof(IT[Bdcsc->nz])); + cudaMalloc((void**) &B_JC, sizeof(IT[Bdcsc->nzc])); + cudaMalloc((void**) &B_numx, sizeof(NT2[Bdcsc->nz])); + cudaMalloc((void**) &colptrC_d, sizeof(IT[Bdcsc->nzc])); + cudaMemcpy(colptrC_d, colptrC, sizeof(IT[Bdcsc->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_CP, Adcsc_Tran->cp, sizeof(IT[Adcsc_Tran->nzc + 1]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_IR, Adcsc_Tran->ir, sizeof(IT[Adcsc_Tran->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_JC, Adcsc_Tran->jc, sizeof(IT[Adcsc_Tran->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_numx, Adcsc_Tran->numx, sizeof(NT1[Adcsc_Tran->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(B_CP, Bdcsc->cp, sizeof(IT[Bdcsc->nzc + 1]), cudaMemcpyHostToDevice); + cudaMemcpy(B_IR, Bdcsc->ir, sizeof(IT[Bdcsc->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(B_JC, Bdcsc->jc, sizeof(IT[Bdcsc->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(B_numx, Bdcsc->numx, sizeof(NT1[Bdcsc->nz]), cudaMemcpyHostToDevice); +/*#ifdef THREADED +#pragma omp parallel for +#endif + for(size_t i=0; i < Bdcsc->nzc; ++i) + { + size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i]; //nnz in the current column of B + int myThread = 0; + +#ifdef THREADED + myThread = omp_get_thread_num(); +#endif + uint* curptr = new uint; + *curptr = colptrC[i]; + cudaMemcpy(curptr_d, curptr, sizeof(uint), cudaMemcpyHostToDevice); + delete curptr; + //uint curptr = colptrC[i]; + /*for(size_t j = 0; j < Adcsc_Tran->nzc; ++j) { + bool made = false; + size_t r = Adcsc_Tran->cp[j]; + for (size_t k = 0; k < nnzcolB; ++k) { + while (r < Adcsc_Tran->cp[j + 1] && Bdcsc->ir[Bdcsc->cp[i]+k] > Adcsc_Tran->ir[r]) { + r++; + } + if (r >= Adcsc_Tran->cp[j + 1]) { + break; + } + if (Bdcsc->ir[Bdcsc->cp[i]+k] == Adcsc_Tran->ir[r]) { + NTO mrhs = Adcsc_Tran->numx[r] * Bdcsc->numx[Bdcsc->cp[i]+k]; + if(true) { + if (made) { + std::get<2>(tuplesC[curptr - 1]) = std::get<2>(tuplesC[curptr - 1]) + mrhs; + } else { + made = true; + //tuplesC[curptr++] = std::make_tuple(Adcsc_Tran->jc[j], Bdcsc->jc[i], mrhs); + std::get<0>(tuplesC[curptr]) = Adcsc_Tran->jc[j]; + std::get<1>(tuplesC[curptr]) = Bdcsc->jc[i]; + std::get<2>(tuplesC[curptr++]) = mrhs; + } + } + } + } + } + //cudaDeviceSynchronize(); + }*/ + + transformColumn(Adcsc_Tran->nzc, A_Tran_CP, A_Tran_IR, A_Tran_JC, A_Tran_numx, B_CP, B_IR, B_JC, B_numx, tuplesC_d, 
colptrC_d, Bdcsc->nzc); + + + if(clearA) + delete const_cast *>(&A); + if(clearB) + delete const_cast *>(&B); + + + + if(deleteAux) + delete [] aux; + //std::cout << "Made it to receive" << std::endl; + IT * tuplesC_o = static_cast (::operator new (sizeof(IT[nnzc]))); + IT * tuplesC_t = static_cast (::operator new (sizeof(IT[nnzc]))); + NTO * tuplesC_th = static_cast (::operator new (sizeof(NTO[nnzc]))); + + uint * colptr_size = static_cast (::operator new (sizeof(uint[Bdcsc->nzc]))); + cudaMemcpy(tuplesC, tuplesC_d, sizeof(std::tuple[nnzc]), cudaMemcpyDeviceToHost); + gpuErrchk( cudaPeekAtLastError() ); +gpuErrchk( cudaDeviceSynchronize() ); + /*std::cout << "Made it to loop" << std::endl; + #ifdef THREADED +#pragma omp parallel for +#endif + for (IT i = 0; i < Bdcsc -> nzc; ++i) { + //std::cout << "Getting: " << i << std::endl; + for (IT j = 0; j < colptr_size[i]; ++j) { + IT in = colptrC[i] + j; + //std::cout << "Grabbed: " << j << " with " << in << std::endl; + tuplesC[in] = std::make_tuple(tuplesC_o[in], tuplesC_t[in], tuplesC_th[in]); + //printf("Made tuple at in %i, with values %i, %i, and %i", in, tuplesC_o[in], tuplesC_t[in], tuplesC_th[in]); + //std::cout << "Built!" << std::endl; + //std::cout << "Done" <* spTuplesC = new SpTuples (nnzc, mdim, ndim, tuplesC, false, true); + + //std::cout << "Made it to return" << std::endl; + // std::cout << "localspgemminfo," << flop << "," << nnzc << "," << compression_ratio << "," << t1-t0 << std::endl; + // std::cout << hashSelected << ", " << Bdcsc->nzc << ", " << (float)hashSelected / Bdcsc->nzc << std::endl; + return spTuplesC; +} +#endif // Hybrid approach of multithreaded HeapSpGEMM and HashSpGEMM template SpTuples * LocalSpGEMMHash diff --git a/include/GALATIC/GALATICMinimumIncludes.cuh b/include/GALATIC/GALATICMinimumIncludes.cuh new file mode 100644 index 00000000..1194cd73 --- /dev/null +++ b/include/GALATIC/GALATICMinimumIncludes.cuh @@ -0,0 +1,4 @@ +#pragma once +#include "../../../ext/GALATIC/include/dCSR.cuh" +#include "../../../ext/GALATIC/include/SemiRingInterface.h" +#include "../../../ext/GALATIC/source/device/Multiply.cuh" diff --git a/include/GALATIC/LICENSE b/include/GALATIC/LICENSE new file mode 100644 index 00000000..16907429 --- /dev/null +++ b/include/GALATIC/LICENSE @@ -0,0 +1,33 @@ +*** License Agreement *** + +MIT License + +GALATIC: GPU Accelerated Sparse Matrix Multiplication over Arbitrary +Semirings (GALATIC) Copyright (c) 2020-2021, The Regents of the +University of California, through Lawrence Berkeley National Laboratory +(subject to receipt of any required approvals from the U.S. Dept. of Energy), +Richard Lettich, and GPUPeople. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + +SOFTWARE. diff --git a/include/GALATIC/README.md b/include/GALATIC/README.md new file mode 100644 index 00000000..5b79b58a --- /dev/null +++ b/include/GALATIC/README.md @@ -0,0 +1,217 @@ +# GALATIC + +Sparse Matrix-Sparse Matrix Multiplication CUDA Template library over generalized semirings. + +This repository was forked from [AC-SpGEMM](https://github.com/GPUPeople/ACSpGEMM). + +This was developed/Tested with +* Linux 4.12 +* CUDA compilation tools 11.1 +* A V100 + +--- + +## Quickstart Guide + +### **Orientation** + +The headers you likely need for minimal functionality (exclusion of `CSR.cuh` is possible, if you load directly to/from GPU memory). + +```c++ +#include "GALATIC/include/CSR.cuh" +#include "GALATIC/include/dCSR.cuh" +#include "GALATIC/include/SemiRingInterface.h" +#include "GALATIC/source/device/Multiply.cuh" +``` + +Where `CSR.cuh` is used to represent matrix storage in the [Compressed Sparse Row format](https://en.wikipedia.org/wiki/Sparse_matrix) for matrices in CPU memory. `dCSR.cuh` is the same, but represents data that is stored in GPU/device memory. + +(Note: there exists a `convert` function in `dCSR.cuh` for converting between the two. The GPU version is required to perform matrix multiplication) + +We recommend you look over these files two files, as you will need to construct the input matrices yourself. + +Additionally there is a `COO.cuh` for use with the coordinate list format which can be converted to `CSR` (but not `coo` to `dCSR`). The conversion is not particularly optimized. + +### **Defining Semirings** + +To define your semiring, you statically extend the "abstract" class defined in `SemiRingInterface.h` +```C++ +// SemiRingInterface.h +template +struct SemiRing { + typedef T leftInput_t; + typedef U rightInput_t; + typedef V output_t; // Don't worry about these typedefs for now + + V multiply(const T& a, const U& b); + V add(const V& a, const V& b); + + V AdditiveIdentity(); +}; +``` + +Notice that multiplication has a left input type `T`, a right input type `U`, and an output type `V`. Addition has `V` as both an input and an output. + +An example follows where multiplication and addition are defined canonically using doubles. + +The `__device__` annotation is required. The `__host__` annotation is needed in if you would like to verify against a CPU SpGEMM implementaiton. + +``` c++ +// Define Your Semiring +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) { return a * b; } + __host__ __device__ double add(const double& a, const double& b) { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + +``` +You may use the "Semiring" structure (e.g. `Arith_SR`) to hold data from outside the matrix (i.e. global device memory) by storing say, a pointer. This will affect performance. + +As to be expected, only memory which is accesible from the GPU is valid. In addition, you should be careful as to not mutate anything such that data races could occur or that an order of operations becomes required. + +Use of constructors / destructors is not reccomended for your semiring struct. The destructor for this will be ran multiple times before multiplication is complete. 
Ideally, the semiring struct should be [trivially copyable](https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable). Thus you must manually free resources your semiring uses (if any) after you are done. Additionally, `T`/`U`/`V` (input / output types) should also be trivially copyable. + + +### Performing Matrix Multiplication + +To decrease the chance of bad error messages, we recommend using `SEMIRING_TYPE::leftInput_t`, `SEMIRING_TYPE::rightInput_t` and `SEMIRING_TYPE::output_t` for your matrices instead of the literal types of `T` and `U`. This will ensure any type errors occur in your code, rather than in the heavily templated library code. It will additionally help prevent errors claiming that the multiplication function for your parameters cannot be found. + +```C++ +CSR<Arith_SR::leftInput_t> input_A_CPU; +CSR<Arith_SR::rightInput_t> input_B_CPU; + +CSR<Arith_SR::output_t> result_mat_CPU; + +dCSR<Arith_SR::leftInput_t> input_A_GPU; +dCSR<Arith_SR::rightInput_t> input_B_GPU; + +dCSR<Arith_SR::output_t> result_mat_GPU; + + +/* ... + ... load data into input_A_CPU, input_B_CPU + ...*/ + +// Transfer input matrices onto GPU +// convert: out <- in +convert(input_A_GPU, input_A_CPU); +convert(input_B_GPU, input_B_CPU); + +// load data into semiring struct. For this one, we don't need to do anything, +// but you still need to pass it in for generality. The cost is trivial. +Arith_SR semiring; + + +// Setup execution options, we'll skip the details for now. + +const int Threads = 256; +const int BlocksPerMP = 1; +const int NNZPerThread = 2; +const int InputElementsPerThreads = 2; +const int RetainElementsPerThreads = 1; +const int MaxChunksToMerge = 16; +const int MaxChunksGeneralizedMerge = 256; // MAX: 865 +const int MergePathOptions = 8; + + +GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); + +const bool Debug_Mode = true; +DefaultTraits.preferLoadBalancing = true; +ExecutionStats stats; +stats.measure_all = false; + +// Actually perform the matrix multiplication +ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + + +// load results onto CPU. +convert(result_mat_CPU, result_mat_GPU); + +``` + +A minimal working example is located in `minimal_example.cu` (note: it currently contains different code). + +Compile it with + +`$ nvcc minimal_example.cu --ftemplate-backtrace-limit 1 --expt-relaxed-constexpr` + +Note: `--expt-relaxed-constexpr` is required. + + +---- + + + +### Testing +You can check the output against a simple CPU version (matrix values, row offsets, column ids). + +Simply add the header +```cpp +#include "GALATIC/include/TestSpGEMM.cuh" +``` + +and execute + +```cpp +TestSpGEMM(input_A_GPU, input_B_GPU, semiring, [=] (const Arith_SR::output_t &a, const Arith_SR::output_t &b) { return std::abs(a-b) < 0.01; }, DefaultTraits); +``` + +`DefaultTraits` is the configuration traits object, as +above. + +The lambda is a function which takes two values of your output type and returns true if they are equivalent, otherwise false. + +Make sure your semiring functions are marked with `__host__`. Additionally, if you are accessing data structures outside the matrix, `cudaMallocManaged` is recommended, as then both the CPU and GPU can access the memory using the same code. + +--- +## Important Information + + +AC-SpGEMM is highly configurable, as can be seen from the traits in `performTestCase`; these traits are implemented as template parameters. +Hence, for all combinations used, the **respective instantiation must be present**.
+Instantiations can be created by modifying the call to `Multiply` in `source/GPU/Multiply.cu` in line 781, which is given as +```cpp +bool called = + EnumOption<256, 256, 128, // Threads + EnumOption<3, 4, 1, // BlocksPerMP + EnumOption<2, 2, 1, // NNZPerThread + EnumOption<4, 4, 1, // InputElementsPerThreads + EnumOption<4, 4, 1, // RetainElementsPerThreads + EnumOption<16, 16, 8, // MaxChunksToMerge + EnumOption<256, 512, 256, // MaxChunksGeneralizedMerge + EnumOption<8, 8, 8, // MergePathOptions + EnumOption<0, 1, 1>>>>>>>>> // DebugMode + ::call(Selection>(call), scheduling_traits.Threads, scheduling_traits.BlocksPerMp, scheduling_traits.NNZPerThread, scheduling_traits.InputElementsPerThreads, scheduling_traits.RetainElementsPerThreads, scheduling_traits.MaxChunksToMerge, scheduling_traits.MaxChunksGeneralizedMerge, scheduling_traits.MergePathOptions, (int)Debug_Mode); +``` +This expanding template will instantiate variants of `MultiplyCall` with the parameters specified in `EnumOption`, so each EnumOption describes all the possible values for a certain property and all different configurations will be instantiated (e.g. BlocksPerMP with `EnumOption<3, 4, 1,` will instantiate the template call with BlocksPerMP=3 and BlocksPerMP=4) + +These parameters may require adjusting for optimal performance, or to just run if your semiring is especially large. + +--- + +# About + +GALATIC: GPU Accelerated Sparse Matrix Multiplication over Arbitrary +Semirings (GALATIC) Copyright (c) 2020-2021, The Regents of the +University of California, through Lawrence Berkeley National Laboratory +(subject to receipt of any required approvals from the U.S. Dept. of Energy), +Richard Lettich, and GPUPeople. All rights reserved. + +If you have questions about your rights to use or distribute this software, +please contact Berkeley Lab's Intellectual Property Office at +IPO@lbl.gov. + +NOTICE. This Software was developed under funding from the U.S. Department +of Energy and the U.S. Government consequently retains certain rights. As +such, the U.S. Government has been granted for itself and others acting on +its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the +Software to reproduce, distribute copies to the public, prepare derivative +works, and perform publicly and display publicly, and to permit others to do so. 
+ +# FAQ +richardl@berkeley.edu + diff --git a/include/GALATIC/a.out b/include/GALATIC/a.out new file mode 100755 index 00000000..4dd7fd7d Binary files /dev/null and b/include/GALATIC/a.out differ diff --git a/include/GALATIC/gmon.out b/include/GALATIC/gmon.out new file mode 100644 index 00000000..267ab310 Binary files /dev/null and b/include/GALATIC/gmon.out differ diff --git a/include/GALATIC/include/COO.cuh b/include/GALATIC/include/COO.cuh new file mode 100644 index 00000000..7d7bfd06 --- /dev/null +++ b/include/GALATIC/include/COO.cuh @@ -0,0 +1,262 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include "Vector.h" +#include +#include + +#include "Vector.h" + +#include +#include +#include +#include +#include +#include +template +struct COO +{ + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_ids; + std::unique_ptr col_ids; + + COO() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); +}; + + +namespace { + struct DataTypeValidator { + static bool validate(std::string type) { + return false; + } + }; +/* + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("real") == 0 || type.compare("integer") == 0; + } + }; + template + struct DataTypeValidator { + static const bool validate(std::string type) { + std::cout << "type: " << type << std::endl; + return type.compare("real") == 0 || type.compare("integer") == 0;; + } + }; + + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("integer") == 0; + } + }; + + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("integer") == 0; + } + };*/ +} + +template +void COO::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + row_ids = std::make_unique(n); + col_ids = std::make_unique(n); +} + +template +COO loadMTX(const char * file) +{ + std::ifstream fstream(file); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + COO resmatrix; + size_t num_rows = 0, num_columns = 0, num_non_zeroes = 0; + + size_t line_counter = 0; + std::string line; + bool pattern = false; + bool hermitian = false; + // read header; + std::getline(fstream, line); + if (line.compare(0, 32, "%%MatrixMarket matrix coordinate") != 0) + throw std::runtime_error("Can only read MatrixMarket format that is in coordinate form"); + std::istringstream iss(line); + std::vector tokens{ std::istream_iterator{iss}, std::istream_iterator{} }; + bool complex = false; + + if (tokens[3] == "pattern") + pattern = true; + else if (tokens[3] == "complex") + complex = true; + else if (tokens[3] != "real") + throw std::runtime_error("MatrixMarket data type does not match matrix format"); + bool symmetric = false; +// if (tokens[4].compare("general") == 0) + symmetric = false; + // else if (tokens[4].compare("symmetric") == 0) +// symmetric = true; + // else if (tokens[4].compare("Hermitian") == 0) + // hermitian = true; + // else + // throw std::runtime_error("Can only read MatrixMarket format that is either symmetric, general or hermitian"); + + while (std::getline(fstream, line)) + { + ++line_counter; + if (line[0] == '%') + continue; + std::istringstream liness(line); + liness >> num_rows >> num_columns >> num_non_zeroes; + if (liness.fail()) + throw std::runtime_error(std::string("Failed to read matrix market header from \"") + file + "\""); + //std::cout << "Read matrix header" << std::endl; + //std::cout << "rows: " << rows << " columns: " << columns << " nnz: " << nnz << std::endl; + break; + } + + size_t reserve = num_non_zeroes; + if (symmetric || hermitian) + reserve *= 2; + + resmatrix.alloc(num_rows, num_columns, reserve); + + //read data + size_t read = 0; + while (std::getline(fstream, line)) + { + ++line_counter; + if (line[0] == '%') + continue; + + std::istringstream liness(line); + + + do + { + char ch; + liness.get(ch); + if (!isspace(ch)) + { + liness.putback(ch); + break; + } + + } while (!liness.eof()); + if 
(liness.eof() || line.length() == 0) + continue; + + uint32_t r, c; + T d; + liness >> r >> c; + if (pattern) + d = 0;// T::Init(1); + else { + double a; + liness >> a; + d =0;// T::Init(a); + } + if (liness.fail()) + throw std::runtime_error(std::string("Failed to read data at line ") + std::to_string(line_counter) + " from matrix market file \"" + file + "\""); + if (r > num_rows) + throw std::runtime_error(std::string("Row index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); + if (c > num_columns) + throw std::runtime_error(std::string("Column index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); + + resmatrix.row_ids[read] = r - 1; + resmatrix.col_ids[read] = c - 1; + resmatrix.data[read] = d; + ++read; + if ((symmetric || hermitian) && r != c) + { + resmatrix.row_ids[read] = c - 1; + resmatrix.col_ids[read] = r - 1; + resmatrix.data[read] = d; + ++read; + } + } + + resmatrix.nnz = read; + return resmatrix; +} + + + +template +COO loadCOO(const char * file) +{ + return COO(); +} + +template +void storeCOO(const COO& mat, const char * file) +{ + +} + +template +void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + std::fill(&res.data[0], &res.data[0] + outsize, 0); + + + if(transpose) + for (size_t i = 0; i < m.nnz; ++i) + res.data[m.col_ids[i]] += m.data[i] * v.data[m.row_ids[i]]; + else + for (size_t i = 0; i < m.nnz; ++i) + res.data[m.row_ids[i]] += m.data[i] * v.data[m.col_ids[i]]; +} + diff --git a/include/GALATIC/include/COO.h b/include/GALATIC/include/COO.h new file mode 100644 index 00000000..1dd8bee5 --- /dev/null +++ b/include/GALATIC/include/COO.h @@ -0,0 +1,62 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include "Vector.h" + +#include + +namespace GALATIC { +template +struct COO +{ + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_ids; + std::unique_ptr col_ids; + + COO() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); +}; + +template +COO loadMTX(const char* file); +template +COO loadCOO(const char* file); +template +void storeCOO(const COO& mat, const char* file); + +template +void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose = false); + +} diff --git a/include/GALATIC/include/CPU_SpGEMM.h b/include/GALATIC/include/CPU_SpGEMM.h new file mode 100644 index 00000000..98e54e4c --- /dev/null +++ b/include/GALATIC/include/CPU_SpGEMM.h @@ -0,0 +1,104 @@ +#include + +#include +#include "CSR.cuh" +#include "dCSR.cuh" + +#pragma once + + template + using Vec = std::vector; + + template + struct CSR_Tuple { + uint64_t col; + T value; + CSR_Tuple(uint64_t col, T value) : col(col), value(value) {} + }; + + + template + void Mult_CPU( CSR &A, CSR &B, CSR& C, SEMIRING_t& sr) + { + + + Vec> result = Vec>(); + Vec row_starts = Vec(); + + int last_percent = 0; + + Vec> temp_buffer = Vec>(); + + + for (uint64_t A_row_idx = 0; A_row_idx < A.rows; A_row_idx++) + { + if (A_row_idx*10 / A.rows > last_percent) { + std::cout << "CPU Done%: " << A_row_idx*100 / A.rows <= A.rows ? A.nnz : A.row_offsets[A_row_idx+1]; + + temp_buffer.clear(); + // for every element A_r,k in row A_row_idx + for (uint64_t A_element_idx = A_row_start; A_element_idx < A_row_end; A_element_idx++) + { + const LEFT_T &A_element = A.data[A_element_idx]; + + // for every element B_k,c + + + uint64_t A_col_idx = A.col_ids[A_element_idx]; + + uint64_t B_row_start = B.row_offsets[A_col_idx]; + uint64_t B_row_end = A_col_idx + 1 >= B.rows ? 
B.nnz : B.row_offsets[A_col_idx+1]; + + + for (uint64_t c_star = B_row_start; c_star < B_row_end; c_star++){ + const RIGHT_T & B_element = B.data[c_star]; + uint64_t b_col = B.col_ids[c_star]; + auto jq =sr.multiply(A_element, B_element); + temp_buffer.push_back(CSR_Tuple(b_col, jq )); + } + + + } + + std::sort( + temp_buffer.begin(), + temp_buffer.end(), + [] (const CSR_Tuple &a, const CSR_Tuple &b) { return a.col < b.col; } + ); + + + int64_t last_col = -1; + row_starts.push_back(result.size()); + for (auto & ele : temp_buffer) { + if (ele.col != last_col) { + result.push_back(ele); + } else { + result[result.size() -1] = CSR_Tuple(ele.col, sr.add(result[result.size() -1].value, ele.value)); + } + last_col = ele.col; + } + } + + C.alloc(A.rows,B.cols, result.size()); + + for (int i = 0; i < result.size(); i++) { + C.data[i] = result.at(i).value; + C.col_ids[i] = result.at(i).col; + } + + row_starts.push_back(result.size()); + + C.row_offsets[0] =0; + for (int i = 0; i < A.rows+1; i++) { + C.row_offsets[i] = row_starts.at(i); + } + + } \ No newline at end of file diff --git a/include/GALATIC/include/CSR.cuh b/include/GALATIC/include/CSR.cuh new file mode 100644 index 00000000..41d0072b --- /dev/null +++ b/include/GALATIC/include/CSR.cuh @@ -0,0 +1,338 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include +#include +#include +#include + +#include "COO.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma once + +template +struct COO; + +template +struct DenseVector; + +template +struct CSR +{ + struct Statistics + { + double mean; + double std_dev; + size_t max; + size_t min; + }; + + void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) + { + // running variance by Welford + size_t count = 0; + mean = 0; + double M2 = 0; + max = 0; + min = cols; + for (size_t i = 0; i < rows; ++i) + { + size_t r_length = row_offsets[i + 1] - row_offsets[i]; + min = std::min(min, r_length); + max = std::max(max, r_length); + ++count; + double newValue = static_cast(r_length); + double delta = newValue - mean; + mean = mean + delta / count; + double delta2 = newValue - mean; + M2 = M2 + delta * delta2; + } + if (count < 2) + std_dev = 0; + else + std_dev = sqrt(M2 / (count - 1)); + } + + Statistics rowStatistics() + { + Statistics stats; + computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min); + return stats; + } + + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_offsets; + std::unique_ptr col_ids; + + CSR() : rows(0), cols(0), nnz(0), data(std::unique_ptr(new T[0])) { + } + void alloc(size_t rows, size_t cols, size_t nnz); + + // CSR& operator=(CSR other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::move(other.data); + // this->row_offsets = std::move(other.row_offsets); + // this->col_ids = std::move(other.col_ids); + // return *this; + // } + + // CSR(const CSR& other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::make_unique(other.nnz); + // memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz); + // this->col_ids = std::make_unique(other.nnz); + // memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz); + // this->row_offsets = std::make_unique(other.rows + 1); + // memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1)); + // } + +}; + + + +namespace { + template + struct State + { + typedef VALUE_TYPE ValueType; + + bool transpose; + + State() : transpose(false) { } + State(bool transpose) : transpose(transpose) { } + }; + + struct CSRIOHeader + { + static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' }; + + char magic[sizeof(Magic)]; + uint64_t typesize; + uint64_t compresseddir; + uint64_t indexsize; + uint64_t fixedoffset; + uint64_t offsetsize; + uint64_t num_rows, num_columns; + uint64_t num_non_zeroes; + + CSRIOHeader() = default; + + + template + static uint64_t typeSize() + { + return sizeof(T); + } + + template + CSRIOHeader(const CSR& mat) + { + for (size_t i = 0; i < sizeof(Magic); ++i) + magic[i] = Magic[i]; + typesize = typeSize(); + compresseddir = 0; + indexsize = typeSize(); + fixedoffset = 0; + offsetsize = typeSize(); + + num_rows = mat.rows; + num_columns = mat.cols; + num_non_zeroes = mat.nnz; + } + + bool checkMagic() const + { + for (size_t i = 0; i < sizeof(Magic); ++i) + if (magic[i] != Magic[i]) + return false; + return true; + } + }; + constexpr char CSRIOHeader::Magic[]; +} + +template +void CSR::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + col_ids = std::make_unique(n); + row_offsets = std::make_unique(r+1); +} + +template 
+CSR loadCSR(const char * file) +{ + std::ifstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header; + State state; + fstream.read(reinterpret_cast(&header), sizeof(CSRIOHeader)); + if (!fstream.good()) + throw std::runtime_error("Could not read CSR header"); + if (!header.checkMagic()) + throw std::runtime_error("File does not appear to be a CSR Matrix"); + + fstream.read(reinterpret_cast(&state), sizeof(state)); + if (!fstream.good()) + throw std::runtime_error("Could not read CompressedMatrix state"); + if (header.typesize != CSRIOHeader::typeSize()) + throw std::runtime_error("File does not contain a CSR matrix with matching type"); + + CSR res; + res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes); + + fstream.read(reinterpret_cast(&res.data[0]), res.nnz * sizeof(T)); + fstream.read(reinterpret_cast(&res.col_ids[0]), res.nnz * sizeof(unsigned int)); + fstream.read(reinterpret_cast(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int)); + + if (!fstream.good()) + throw std::runtime_error("Could not read CSR matrix data"); + + return res; +} + +template +void storeCSR(const CSR& mat, const char * file) +{ + std::ofstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header(mat); + State state; + fstream.write(reinterpret_cast(&header), sizeof(CSRIOHeader)); + fstream.write(reinterpret_cast(&state), sizeof(state)); + fstream.write(reinterpret_cast(&mat.data[0]), mat.nnz * sizeof(T)); + fstream.write(reinterpret_cast(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int)); + fstream.write(reinterpret_cast(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int)); + +} + +template +void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? 
m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + if (transpose) + { + std::fill(&res.data[0], &res.data[0] + m.cols, 0); + for (size_t i = 0; i < m.rows; ++i) + { + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + res.data[m.col_ids[o]] += m.data[o] * v.data[i]; + } + } + else + { + for (size_t i = 0; i < m.rows; ++i) + { + T val = 0; + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + val += m.data[o] * v.data[m.col_ids[o]]; + res.data[i] = val; + } + } +} + +template +void convert(CSR& res, const COO& coo) +{ + struct Entry + { + unsigned int r, c; + T v; + bool operator < (const Entry& other) + { + if (r != other.r) + return r < other.r; + return c < other.c; + } + }; + + std::vector entries; + std::cout << coo.nnz << std::endl; + entries.reserve(coo.nnz); + for (size_t i = 0; i < coo.nnz; ++i) + entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] }); + std::sort(std::begin(entries), std::end(entries)); + + res.alloc(coo.rows, coo.cols, coo.nnz); + std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0); + for (size_t i = 0; i < coo.nnz; ++i) + { + res.data[i] = entries[i].v; + res.col_ids[i] = entries[i].c; + ++res.row_offsets[entries[i].r]; + } + + unsigned int off = 0; + for (size_t i = 0; i < coo.rows; ++i) + { + unsigned int n = off + res.row_offsets[i]; + res.row_offsets[i] = off; + off = n; + } + res.row_offsets[coo.rows] = off; +} diff --git a/include/GALATIC/include/CSR.h b/include/GALATIC/include/CSR.h new file mode 100644 index 00000000..dd0444fd --- /dev/null +++ b/include/GALATIC/include/CSR.h @@ -0,0 +1,340 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include +#include +#include +#include +#include "COO.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace GALATIC { +template +struct COO; + +template +struct DenseVector; + +template +struct CSR +{ + struct Statistics + { + double mean; + double std_dev; + size_t max; + size_t min; + }; + + void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) + { + // running variance by Welford + size_t count = 0; + mean = 0; + double M2 = 0; + max = 0; + min = cols; + for (size_t i = 0; i < rows; ++i) + { + size_t r_length = row_offsets[i + 1] - row_offsets[i]; + min = std::min(min, r_length); + max = std::max(max, r_length); + ++count; + double newValue = static_cast(r_length); + double delta = newValue - mean; + mean = mean + delta / count; + double delta2 = newValue - mean; + M2 = M2 + delta * delta2; + } + if (count < 2) + std_dev = 0; + else + std_dev = sqrt(M2 / (count - 1)); + } + + Statistics rowStatistics() + { + Statistics stats; + computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min); + return stats; + } + + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_offsets; + std::unique_ptr col_ids; + + CSR() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); + + // CSR& operator=(CSR other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::move(other.data); + // this->row_offsets = std::move(other.row_offsets); + // this->col_ids = std::move(other.col_ids); + // return *this; + // } + + // CSR(const CSR& other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::make_unique(other.nnz); + // memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz); + // this->col_ids = std::make_unique(other.nnz); + // memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz); + // this->row_offsets = std::make_unique(other.rows + 1); + // memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1)); + // } + +}; + + + +namespace { + template + struct State + { + typedef VALUE_TYPE ValueType; + + ValueType scaling; + bool transpose; + + State() : scaling(1), transpose(false) { } + State(ValueType scaling, bool transpose) : scaling(scaling), transpose(transpose) { } + }; + + struct CSRIOHeader + { + static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' }; + + char magic[sizeof(Magic)]; + uint64_t typesize; + uint64_t compresseddir; + uint64_t indexsize; + uint64_t fixedoffset; + uint64_t offsetsize; + uint64_t num_rows, num_columns; + uint64_t num_non_zeroes; + + CSRIOHeader() = default; + + + template + static uint64_t typeSize() + { + return sizeof(T); + } + + template + CSRIOHeader(const CSR& mat) + { + for (size_t i = 0; i < sizeof(Magic); ++i) + magic[i] = Magic[i]; + typesize = typeSize(); + compresseddir = 0; + indexsize = typeSize(); + fixedoffset = 0; + offsetsize = typeSize(); + + num_rows = mat.rows; + num_columns = mat.cols; + num_non_zeroes = mat.nnz; + } + + bool checkMagic() const + { + for (size_t i = 0; i < sizeof(Magic); ++i) + if (magic[i] != Magic[i]) + return false; + return true; + } + }; + constexpr char CSRIOHeader::Magic[]; +} + +template +void CSR::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + col_ids = std::make_unique(n); + row_offsets = 
std::make_unique(r+1); +} + +template +CSR loadCSR(const char * file) +{ + std::ifstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header; + State state; + fstream.read(reinterpret_cast(&header), sizeof(CSRIOHeader)); + if (!fstream.good()) + throw std::runtime_error("Could not read CSR header"); + if (!header.checkMagic()) + throw std::runtime_error("File does not appear to be a CSR Matrix"); + + fstream.read(reinterpret_cast(&state), sizeof(state)); + if (!fstream.good()) + throw std::runtime_error("Could not read CompressedMatrix state"); + if (header.typesize != CSRIOHeader::typeSize()) + throw std::runtime_error("File does not contain a CSR matrix with matching type"); + + CSR res; + res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes); + + fstream.read(reinterpret_cast(&res.data[0]), res.nnz * sizeof(T)); + fstream.read(reinterpret_cast(&res.col_ids[0]), res.nnz * sizeof(unsigned int)); + fstream.read(reinterpret_cast(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int)); + + if (!fstream.good()) + throw std::runtime_error("Could not read CSR matrix data"); + + return res; +} + +template +void storeCSR(const CSR& mat, const char * file) +{ + std::ofstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header(mat); + State state; + fstream.write(reinterpret_cast(&header), sizeof(CSRIOHeader)); + fstream.write(reinterpret_cast(&state), sizeof(state)); + fstream.write(reinterpret_cast(&mat.data[0]), mat.nnz * sizeof(T)); + fstream.write(reinterpret_cast(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int)); + fstream.write(reinterpret_cast(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int)); + +} + +template +void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? 
m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + if (transpose) + { + std::fill(&res.data[0], &res.data[0] + m.cols, 0); + for (size_t i = 0; i < m.rows; ++i) + { + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + res.data[m.col_ids[o]] += m.data[o] * v.data[i]; + } + } + else + { + for (size_t i = 0; i < m.rows; ++i) + { + T val = 0; + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + val += m.data[o] * v.data[m.col_ids[o]]; + res.data[i] = val; + } + } +} + +template +void convert(CSR& res, const COO& coo) +{ + struct Entry + { + unsigned int r, c; + T v; + bool operator < (const Entry& other) + { + if (r != other.r) + return r < other.r; + return c < other.c; + } + }; + + std::vector entries; + std::cout << coo.nnz << std::endl; + entries.reserve(coo.nnz); + for (size_t i = 0; i < coo.nnz; ++i) + entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] }); + std::sort(std::begin(entries), std::end(entries)); + + res.alloc(coo.rows, coo.cols, coo.nnz); + std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0); + for (size_t i = 0; i < coo.nnz; ++i) + { + res.data[i] = entries[i].v; + res.col_ids[i] = entries[i].c; + ++res.row_offsets[entries[i].r]; + } + + unsigned int off = 0; + for (size_t i = 0; i < coo.rows; ++i) + { + unsigned int n = off + res.row_offsets[i]; + res.row_offsets[i] = off; + off = n; + } + res.row_offsets[coo.rows] = off; +} + + +}; diff --git a/include/GALATIC/include/ColorText.h b/include/GALATIC/include/ColorText.h new file mode 100644 index 00000000..8c939162 --- /dev/null +++ b/include/GALATIC/include/ColorText.h @@ -0,0 +1,25 @@ +#include +#pragma once +namespace Color { + enum Code { + FG_RED = 31, + FG_GREEN = 32, + FG_YELLOW = 93, + + FG_BLUE = 34, + FG_DEFAULT = 39, + BG_RED = 41, + BG_GREEN = 42, + BG_BLUE = 44, + BG_DEFAULT = 49 + }; + class Modifier { + Code code; + public: + Modifier(Code pCode) : code(pCode) {} + friend std::ostream& + operator<<(std::ostream& os, const Modifier& mod) { + return os << "\033[" << mod.code << "m"; + } + }; +} \ No newline at end of file diff --git a/include/GALATIC/include/Compare.cuh b/include/GALATIC/include/Compare.cuh new file mode 100644 index 00000000..a577e314 --- /dev/null +++ b/include/GALATIC/include/Compare.cuh @@ -0,0 +1,109 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Compare.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.cuh" +#include +#include "common.h" + + +namespace ACSpGEMM { + + template + __global__ void d_compare(int in_rows, int in_cols, const uint32_t *__restrict reference_offset, + const uint32_t *__restrict reference_indices, const DataType *__restrict reference_values, + const uint32_t *__restrict compare_offset, const uint32_t *__restrict compare_indices, + const DataType *__restrict compare_values, bool compare_data, double epsilon, + uint32_t *verification) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) { + if (!(reference_values[ref_offset + i] == compare_values[comp_offset + i])) { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + } + + template + bool Compare(const dCSR &reference_mat, const dCSR &compare_mat, bool compare_data) { + int blockSize(256); + int gridSize(divup(reference_mat.rows + 1, blockSize)); + double epsilon = 0.1; + uint32_t *verification, h_verification; + cudaMalloc(&verification, sizeof(uint32_t)); + cudaMemset(verification, 0, sizeof(uint32_t)); + + d_compare <<< gridSize, blockSize >>> (reference_mat.rows, reference_mat.cols, + reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data, + compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data, + compare_data, epsilon, verification); + + cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost); + return (h_verification == 0); + } +} \ No newline at end of file diff --git a/include/GALATIC/include/Compare.h b/include/GALATIC/include/Compare.h new file mode 100644 index 00000000..1fc2cce9 --- /dev/null +++ b/include/GALATIC/include/Compare.h @@ -0,0 +1,123 
@@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Compare.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.h" + +#include "common.h" + + +namespace ACSpGEMM { + + template + __global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values, + const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification) + { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) + { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) + { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) + { + if (reference_values[ref_offset + i] != compare_values[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, 
reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + + return; + } + template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); + template + bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data) + { + int blockSize(256); + int gridSize(divup(reference_mat.rows + 1, blockSize)); + double epsilon = 0.1; + uint32_t* verification, h_verification; + cudaMalloc(&verification, sizeof(uint32_t)); + cudaMemset(verification, 0, sizeof(uint32_t)); + + d_compare << > > (reference_mat.rows, reference_mat.cols, + reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data, + compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data, + compare_data, epsilon, verification); + + cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost); + return (h_verification == 0); + } +//// +// template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); +// template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); +// template bool Compare( dCSR const& reference_mat, dCSR const& compare_mat, bool compare_data); + // template bool Compare( dCSR const& reference_mat, dCSR const& compare_mat, bool compare_data); + +} +// +// diff --git a/include/GALATIC/include/CustomExceptions.h b/include/GALATIC/include/CustomExceptions.h new file mode 100644 index 00000000..53003f28 --- /dev/null +++ b/include/GALATIC/include/CustomExceptions.h @@ -0,0 +1,85 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* CustomExceptions.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include + +class SpGEMMException : public std::exception +{ +public: + virtual char const * what() const noexcept{ return "SpGEMM: Stage failed"; } +}; + +class MergeSimpleCaseException : public std::exception +{ +public: +virtual char const * what() const noexcept { return "MERGE: Simple Case failed"; } +}; + +class MergeMaxChunksCaseException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Max Chunks Case failed"; } +}; + +class MergeGeneralizedCaseException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Generalized Case failed"; } +}; + +class MergeLoopingException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Merge Stage took longer than 10 seconds"; } +}; + +class RestartOutOfMemoryException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "RESTART: SpGEMM out of memory"; } +}; + +class RestartOutOfChunkPointerException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "RESTART: SpGEMM out of chunk pointers"; } +}; + diff --git a/include/GALATIC/include/GALATIC.cuh b/include/GALATIC/include/GALATIC.cuh new file mode 100644 index 00000000..0088231d --- /dev/null +++ b/include/GALATIC/include/GALATIC.cuh @@ -0,0 +1,49 @@ + +#pragma once +namespace GALATIC { +template +void convert(CSR& dst, const dCSR& src, unsigned int padding=0) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; + + + + +// template +// cusp::csr_matrix< IDX_t, VALUE_t, cusp::device_memory> to_cusp_csr( CSR& orig_mat) +// { +// cusp::csr_matrix result_cpu(orig_mat.rows, orig_mat.cols, orig_mat.nnz); + +// for (int i = 0; i < orig_mat.rows; i++) { +// result_cpu.row_offsets[i] = orig_mat.row_offsets[i]; +// } + +// for (int i = 0; i < orig_mat.nnz; i++) { +// result_cpu.column_indices[i] = orig_mat.col_ids[i]; +// result_cpu.values[i] = orig_mat.data[i]; +// } + +// cusp::csr_matrix result(result_cpu); +// return result; +// } + + +// template +// void CuspMultiplyWrapper(cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& A, +// cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& B, +// cusp::csr_matrix< IDX_t, typename SEMIRING_t::output_t, cusp::device_memory>& C, +// SEMIRING_t sr) { +// cusp::multiply( A,B,C, [] __device__ (typename SEMIRING_t::output_t a ) {return SEMIRING_t::AdditiveIdentity(); }, +// [sr] __device__ ( typename SEMIRING_t::input_t a, typename SEMIRING_t::input_t b) {return sr.multiply(a,b); }, +// [sr] __device__ ( typename SEMIRING_t::output_t a, typename SEMIRING_t::output_t b) { +// auto q= sr.add(a,b); +// return q; } ); + +// } \ No newline at end of file diff --git a/include/GALATIC/include/GALATIC.h b/include/GALATIC/include/GALATIC.h 
new file mode 100644 index 00000000..c37626d7 --- /dev/null +++ b/include/GALATIC/include/GALATIC.h @@ -0,0 +1,50 @@ + +#include +#include + + +namespace GALATIC { +template +void convert(CSR& dst, const dCSR& src, unsigned int padding=0) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; + +template +cusp::csr_matrix< IDX_t, VALUE_t, cusp::device_memory> to_cusp_csr( CSR& orig_mat) +{ + cusp::csr_matrix result_cpu(orig_mat.rows, orig_mat.cols, orig_mat.nnz); + + for (int i = 0; i < orig_mat.rows; i++) { + result_cpu.row_offsets[i] = orig_mat.row_offsets[i]; + } + + for (int i = 0; i < orig_mat.nnz; i++) { + result_cpu.column_indices[i] = orig_mat.col_ids[i]; + result_cpu.values[i] = orig_mat.data[i]; + } + + cusp::csr_matrix result(result_cpu); + return result; +} + + + + +template +void CuspMultiplyWrapper(cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& A, + cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& B, + cusp::csr_matrix< IDX_t, typename SEMIRING_t::output_t, cusp::device_memory>& C, + SEMIRING_t sr) { + cusp::multiply(A,B,C, __device__ [] (auto a) { return a;}, + __device__ [sr] (thrust::device_reference& a const, thrust::device_reference& b const) {return sr.multiply(a,b); }, + __device__ [sr](typename SEMIRING_t::output_t& a const, typename SEMIRING_t::output_t & b const ) {return sr.add(a,b); } ); + + +} \ No newline at end of file diff --git a/include/GALATIC/include/MergeCaseOffsets.h b/include/GALATIC/include/MergeCaseOffsets.h new file mode 100644 index 00000000..d48364e0 --- /dev/null +++ b/include/GALATIC/include/MergeCaseOffsets.h @@ -0,0 +1,49 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include + +// To circumnavigate a problem with nvcc and std::tuple as described here: https://devtalk.nvidia.com/default/topic/1028112/cuda-setup-and-installation/nvcc-bug-related-to-gcc-6-lt-tuple-gt-header-/ + +struct MergeCaseOffsets{ + size_t shared_rows_simple; + size_t shared_rows_max_chunks; + size_t shared_rows_generalized; + size_t shared_rows_simple_rows; + + MergeCaseOffsets(): + shared_rows_simple(0), shared_rows_max_chunks(0), shared_rows_generalized(0), shared_rows_simple_rows(0){} + + MergeCaseOffsets(size_t simple, size_t max, size_t generalized, size_t simple_rows): + shared_rows_simple(simple), shared_rows_max_chunks(max), shared_rows_generalized(generalized), shared_rows_simple_rows(simple_rows){} +}; \ No newline at end of file diff --git a/include/GALATIC/include/Multiply.h b/include/GALATIC/include/Multiply.h new file mode 100644 index 00000000..9823360d --- /dev/null +++ b/include/GALATIC/include/Multiply.h @@ -0,0 +1,58 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* Multiply.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include "dCSR.h" +#include "execution_stats.h" +#include "default_scheduling_traits.h" + +static void HandleError( cudaError_t err, + const char *file, + int line ) { + if (err != cudaSuccess) { + printf( "%s in %s at line %d\n", cudaGetErrorString( err ), + file, line ); + std::cout << std::flush; + throw std::exception(); + } +} +#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) + + diff --git a/include/GALATIC/include/SemiRingInterface.h b/include/GALATIC/include/SemiRingInterface.h new file mode 100644 index 00000000..61e17ff4 --- /dev/null +++ b/include/GALATIC/include/SemiRingInterface.h @@ -0,0 +1,33 @@ +// +// Created by Richard Lettich on 4/13/21. +// + +#ifndef ACSPGEMM_FUNCTION_INTERFACE_H +#define ACSPGEMM_FUNCTION_INTERFACE_H + +template +struct SemiRing { + typedef T leftInput_t; + typedef U rightInput_t; + typedef V output_t; + + V multiply(const T& a, const U& b); + V add(const V& a, const V& b); + + V AdditiveIdentity(); +}; + + + +#endif //ACSPGEMM_FUNCTION_INTERFACE_H + + + + + + + + + + + diff --git a/include/GALATIC/include/TestSpGEMM.cuh b/include/GALATIC/include/TestSpGEMM.cuh new file mode 100644 index 00000000..d65c14c6 --- /dev/null +++ b/include/GALATIC/include/TestSpGEMM.cuh @@ -0,0 +1,112 @@ + +#include +#include "CPU_SpGEMM.h" +#include "CSR.cuh" +#include "dCSR.cuh" +#include "../source/device/Multiply.cuh" + +template +void TestSpGEMM( dCSR& A, dCSR& B, SEMIRING_T semiring, F equiv_rel, GPUMatrixMatrixMultiplyTraits& traits) +{ + + //bool checkBitStability{true}; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + output_stats.measure_all = false; + + dCSR result_mat; + + std::cout << "starting GPU matrix multiply" << std::endl; + + ACSpGEMM::Multiply(A, B, result_mat, traits, warmupstats, true, semiring); + cudaDeviceSynchronize(); + std::cout << "GPU matrix multiply Done" << std::endl; + + + + // Convert input matrices + + CSR A_cpu; + CSR B_cpu; + + convert(A_cpu, A); + + convert(B_cpu, B); + + cudaDeviceSynchronize(); + + //convert gpu result to cpu + CSR GPU_result_cpu; + cudaDeviceSynchronize(); + + convert(GPU_result_cpu, result_mat); + + cudaDeviceSynchronize(); + + + CSR CPU_result_cpu; + Mult_CPU(A_cpu, B_cpu, CPU_result_cpu, semiring); + + std::cout << "Checking = # Rows, Cols, NNZ...."; + assert(CPU_result_cpu.rows == GPU_result_cpu.rows); + std::cout << "Cpu "<< CPU_result_cpu.cols << "gpu " << GPU_result_cpu.cols; + assert(CPU_result_cpu.cols == GPU_result_cpu.cols); + assert(CPU_result_cpu.nnz == GPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + std::cout << "Checking Equivalency for non zeros..."; + + + int correct = 0; + for (int i = 0; i < CPU_result_cpu.nnz; i++) { + if (equiv_rel(CPU_result_cpu.data[i], GPU_result_cpu.data[i])) { + correct++; + } + } + + std::cout << "num correct " << correct << "/ " << CPU_result_cpu.nnz << std::endl; + assert(correct == CPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + + std::cout << "Checking Equivalency for Column Id's..."; + + + + + int correct_col_ids = 0; + for (int i = 0; i < CPU_result_cpu.nnz; i++) { + if (CPU_result_cpu.col_ids[i] == GPU_result_cpu.col_ids[i]) { + correct_col_ids++; + } + } + + assert(correct_col_ids == 
CPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + std::cout << "Checking Equivalency for Row offsets's..."; + + int cor_row_ids = 0; + + + + + for (int i = 0; i < CPU_result_cpu.rows+1; i++) { + if (CPU_result_cpu.row_offsets[i] == GPU_result_cpu.row_offsets[i]) { + cor_row_ids++; + + } else { + std::cout << " issue at " << i<< " with " << CPU_result_cpu.row_offsets[i] << " vs "<< GPU_result_cpu.row_offsets[i]<< std::endl; + } + } + + std::cout << cor_row_ids << " correct out of " << CPU_result_cpu.rows+1 << std::endl; + + assert(cor_row_ids == CPU_result_cpu.rows+1); + std::cout << " correct" << std::endl; + std::cout << "correctness check complete" << std::endl; + +} \ No newline at end of file diff --git a/include/GALATIC/include/Transpose.h b/include/GALATIC/include/Transpose.h new file mode 100644 index 00000000..6b89a617 --- /dev/null +++ b/include/GALATIC/include/Transpose.h @@ -0,0 +1,157 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* Transpose.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.cuh" + + +// Global includes +#include +#include +#include "device_launch_parameters.h" + +// Local includes +#include "common.h" + +__global__ void d_calulateTransposeDistribution(int in_rows, int in_cols, + const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, uint32_t* output_offset) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + atomicAdd(output_offset + input_indices[offset + i], 1); + } + + return; +} + +template +__global__ void d_findPosition(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, + const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + uint32_t row_index = input_indices[offset + i]; + uint32_t insert_position = atomicAdd(helper + row_index, 1); + uint32_t o_offset = output_offset[row_index]; + helper_position[o_offset + insert_position] = tid; + } + + return; +} + +template +__global__ void d_writeTranspose(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, + const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + uint32_t row_index = input_indices[offset + i]; + uint32_t actual_position(0); + uint32_t entries_output = helper[row_index]; + uint32_t o_offset = output_offset[row_index]; + for (uint32_t j = 0; j < entries_output; ++j) + { + if (helper_position[o_offset + j] < tid) + ++actual_position; + } + output_indices[o_offset + actual_position] = tid; + output_values[o_offset + actual_position] = input_values[offset + i]; + } + + return; +} + + + template + void Transpose(const dCSR& matIn, dCSR& matTransposeOut) + { + int blockSize(256); + int gridSize(divup(matIn.rows + 1, blockSize)); + + matTransposeOut.alloc(matIn.cols, matIn.rows, matIn.nnz); + + // Allocate and set helper resources, Memset output vector + uint32_t* d_helper_pointer, *d_helper_position; + cudaMalloc(&d_helper_pointer, sizeof(uint32_t) * (matTransposeOut.rows + 1)); + cudaMalloc(&d_helper_position, sizeof(uint32_t) * (matTransposeOut.nnz)); + cudaMemset(d_helper_pointer, 0, sizeof(uint32_t) * (matTransposeOut.rows + 1)); + cudaMemset(matTransposeOut.row_offsets, 0, (matTransposeOut.rows + 1) * sizeof(uint32_t)); + + // Calculate entry distribution + d_calulateTransposeDistribution<<>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matTransposeOut.row_offsets); + + // 
Prefix sum for new offset vector + thrust::device_ptr th_offset_vector(matTransposeOut.row_offsets); + thrust::exclusive_scan(th_offset_vector, th_offset_vector + matTransposeOut.rows + 1, th_offset_vector); + + // Find position for insertion (keeping sort order) + d_findPosition <<>> (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position); + + // Write Transpose + d_writeTranspose <<>> (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position); + + // Free helper resources + cudaFree(d_helper_pointer); + cudaFree(d_helper_position); + + return; + } diff --git a/include/GALATIC/include/Vector.h b/include/GALATIC/include/Vector.h new file mode 100644 index 00000000..5457359a --- /dev/null +++ b/include/GALATIC/include/Vector.h @@ -0,0 +1,48 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
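A short usage sketch for the Transpose helper above (illustrative only; the dCSR element type is an assumption, since the template arguments are elided in this diff view).

#include "dCSR.cuh"
#include "Transpose.h"

void transpose_example(const dCSR<float>& A_d)    // device matrix, assumed populated
{
    dCSR<float> At_d;
    Transpose(A_d, At_d);       // allocates At_d as cols x rows with the same nnz and fills it
    cudaDeviceSynchronize();    // the three kernels above are launched asynchronously
}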
+// + +#pragma once + +#include + +template +struct DenseVector +{ + size_t size; + std::unique_ptr data; + + DenseVector() : size(0) { } + void alloc(size_t s) + { + data = std::make_unique(s); + size = s; + } +}; diff --git a/include/GALATIC/include/common.cuh b/include/GALATIC/include/common.cuh new file mode 100644 index 00000000..948be63d --- /dev/null +++ b/include/GALATIC/include/common.cuh @@ -0,0 +1,319 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + + +#include "meta_utils.h" +#include "common.h" +#include +#include + + + +/////////////// HELPERS ///////////////////////// + +const uint32_t WARP_SIZE = 32; + + + + +template +struct ConditionalIteration +{ + template + __device__ + static void iterate(F f) + { + bool res = f(BEGIN); + if (res) + ConditionalIteration::iterate(f); + } +}; + +template +struct ConditionalIteration +{ + template + __device__ + static void iterate(F f) + { + } +}; + + +template +struct VecLoadTypeImpl; + +template<> +struct VecLoadTypeImpl<4> +{ + using type = unsigned int; +}; +template<> +struct VecLoadTypeImpl<8> +{ + using type = uint2; +}; +template<> +struct VecLoadTypeImpl<16> +{ + using type = uint4; +}; + +template +struct VecLoadType +{ + using type = typename VecLoadTypeImpl::type; + union + { + T data[N]; + type vec; + }; + + __device__ __forceinline__ VecLoadType() = default; + __device__ __forceinline__ VecLoadType(type v) : vec(v) {}; +}; + +template +__device__ __forceinline__ void warp_load_vectorized(T (&out)[N], const T* in) +{ + static_assert(static_popcnt::value == 1, "load_vectorized only works for pow 2 elements"); + + using LoadType = VecLoadType; + const typename LoadType::type* vec_in = reinterpret_cast(in + (threadIdx.x/WARP_SIZE)*WARP_SIZE*N) + laneid(); + + //TODO: get rid of UB by doing an explicit unroll and just use the vec type + #pragma unroll + for (int i = 0; i < N / VecSize; ++i) + { + LoadType loaded; + loaded.vec = vec_in[i*WARP_SIZE]; + #pragma unroll + for (int j = 0; j < VecSize; ++j) + out[i*VecSize + j] = loaded.data[j]; + } +} + +template +__device__ __forceinline__ void vectorized_to_blocked(T(&data)[N]) +{ + const int Vecs = N / VecSize; + + //rotate + #pragma unroll + for (int k = 0; k < Vecs - 1; ++k) + { + if (laneid() % Vecs > k) + { + T tmp[VecSize]; + #pragma unroll + for (int i = 0; i < VecSize; ++i) + tmp[i] = data[(Vecs - 1)*VecSize + i]; + + #pragma unroll + for (int j = Vecs - 1; j > 0; --j) + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[j*VecSize + i] = data[(j - 1)*VecSize + i]; + + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[i] = tmp[i]; + } + } + + //shfl + int pad_offset = Vecs - (laneid() * Vecs) / WARP_SIZE; + int section_offset = (laneid() * Vecs) % WARP_SIZE; + + #pragma unroll + for (int j = 0; j < Vecs; ++j) + { + int shfl_offset = section_offset + ((pad_offset + j) % Vecs); + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[j*VecSize + i] = __shfl(data[j*VecSize + i], shfl_offset); + } + + //rotate back + #pragma unroll + for (int k = 0; k < Vecs - 1; ++k) + { + if ((laneid() * Vecs) / WARP_SIZE > k) + { + T tmp[VecSize]; + #pragma unroll + for (int i = 0; i < VecSize; ++i) + tmp[i] = data[i]; + + #pragma unroll + for (int j = 1; j < Vecs; ++j) + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[(j - 1)*VecSize + i] = data[j*VecSize + i]; + + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[(Vecs - 1)*VecSize + i] = tmp[i]; + } + } +} + + +template +struct ThreadOddEvenMerge; + +template +struct ThreadOddEvenMergeImpl; + +template +__device__ __forceinline__ void swap(T& a, T& b) +{ + T temp = a; + a = b; + b = temp; +} + +template +struct ThreadOddEvenMergeImpl +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + ThreadOddEvenMerge::run(key); + ThreadOddEvenMerge::run(key); +#pragma unroll + for (int i = LO + R; i + R < LO + N; i += M) + if (COMP::comp(key[i], key[i + R])) + swap(key[i], key[i + R]); + } + template + __device__ 
__forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + ThreadOddEvenMerge::run(key, value); + ThreadOddEvenMerge::run(key, value); +#pragma unroll + for (int i = LO + R; i + R < LO + N; i += M) + if (COMP::comp(key[i], key[i + R])) + swap(key[i], key[i + R]), + swap(value[i], value[i + R]); + } +}; +template +struct ThreadOddEvenMergeImpl +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + if (COMP::comp(key[LO], key[LO + R])) + swap(key[LO], key[LO + R]); + } + template + __device__ __forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + if (COMP::comp(key[LO], key[LO + R])) + swap(key[LO], key[LO + R]), + swap(value[LO], value[LO + R]); + } +}; + + +template +struct ThreadOddEvenMerge : public ThreadOddEvenMergeImpl +{ +}; + +template +struct ThreadOddEvenMergeSort +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + ThreadOddEvenMergeSort::run(key); + ThreadOddEvenMergeSort::run(key); + ThreadOddEvenMerge::run(key); + } + template + __device__ __forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + ThreadOddEvenMergeSort::run(key, value); + ThreadOddEvenMergeSort::run(key, value); + ThreadOddEvenMerge::run(key, value); + } +}; + +template +struct ThreadOddEvenMergeSort +{ + template + __device__ __forceinline__ static void run(K (&key)[L]) + { } + template + __device__ __forceinline__ static void run(K (&key)[L], V(&value)[L]) + { } +}; + +template +__device__ __forceinline__ void threadOddEvenMergeSort(K(&key)[L]) +{ + ThreadOddEvenMergeSort::run(key); +} +template +__device__ __forceinline__ void threadOddEvenMergeSort(K(&key)[L], V(&value)[L]) +{ + ThreadOddEvenMergeSort::run(key, value); +} + +struct SortAscending +{ + template + __device__ __forceinline__ static bool comp(T a, T b) + { + return a > b; + } +}; + +struct SortDescending +{ + template + __device__ __forceinline__ static bool comp(T a, T b) + { + return a < b; + } +}; + +__device__ __forceinline__ inline uint32_t laneid() +{ + uint32_t mylaneid; + asm("mov.u32 %0, %laneid;" : "=r" (mylaneid)); + return mylaneid; +} diff --git a/include/GALATIC/include/common.h b/include/GALATIC/include/common.h new file mode 100644 index 00000000..f2e43944 --- /dev/null +++ b/include/GALATIC/include/common.h @@ -0,0 +1,46 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
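A device-side sketch of the register sorting network defined above; it assumes the comparator is the first template parameter (as the SortAscending / SortDescending helpers suggest) and that the per-thread array length is a power of two, which the odd-even merge requires.

#include "common.cuh"

__global__ void sort_in_registers_demo(int* out)
{
    int keys[4] = { 7, 1, 5, 3 };                  // per-thread data held in registers
    threadOddEvenMergeSort<SortAscending>(keys);   // SortAscending::comp(a,b) == (a > b), so this sorts ascending
    if (threadIdx.x == 0 && blockIdx.x == 0)
        for (int i = 0; i < 4; ++i)
            out[i] = keys[i];                      // out == {1, 3, 5, 7}
}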
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include + +template +constexpr __host__ __device__ __forceinline__ T divup(T a, T b) +{ + return (a + b - 1) / b; +} + +template +constexpr __host__ __device__ __forceinline__ T alignment(const T size, size_t alignment) +{ + return divup(size, alignment) * alignment; +} diff --git a/include/GALATIC/include/consistent_memory.h b/include/GALATIC/include/consistent_memory.h new file mode 100644 index 00000000..7e2e0c1e --- /dev/null +++ b/include/GALATIC/include/consistent_memory.h @@ -0,0 +1,109 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
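Two worked examples for the helpers above; since both are constexpr, the results can be checked at compile time.

#include "common.h"

static_assert(divup(10, 4) == 3,      "integer division rounded up");
static_assert(alignment(20, 8) == 24, "20 rounded up to the next multiple of 8");
static_assert(alignment(16, 8) == 16, "exact multiples are left unchanged");

// Typical use when sizing a kernel launch:  int gridSize = divup(num_rows, blockSize);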
+// + +#pragma once + +#include +#include +#include "memory_space.h" + +namespace ACSpGEMM { + class RegisteredMemory + { + public: + virtual size_t clear() = 0; + }; + + inline std::vector& getRegMemories() + { + static std::vector m; + return m; + } + + inline void register_consistent_memory(RegisteredMemory* memory) + { + getRegMemories().push_back(memory); + } + inline void unregister_consistent_memory(RegisteredMemory* memory) + { + auto &m = getRegMemories(); + std::remove(begin(m), end(m), memory); + } + inline size_t clear_consistentMemory() + { + size_t s = 0; + for (auto m : getRegMemories()) + s += m->clear(); + return s; + } + + template + class ConsistentMemory; + + template + class RegisteredMemoryVar : RegisteredMemory + { + T v; + size_t clear() override + { + v = 0; + return 0; + } + public: + RegisteredMemoryVar() : v(0) + { + register_consistent_memory(this); + } + explicit RegisteredMemoryVar(T v) : v(v) + { + register_consistent_memory(this); + } + ~RegisteredMemoryVar() + { + unregister_consistent_memory(this); + } + + RegisteredMemoryVar& operator+= (T add) + { + v += add; + return *this; + } + + void operator = (T other) + { + v = other; + } + operator T() const noexcept + { + return v; + } + }; +} diff --git a/include/GALATIC/include/dCSR.cuh b/include/GALATIC/include/dCSR.cuh new file mode 100644 index 00000000..a67cea6e --- /dev/null +++ b/include/GALATIC/include/dCSR.cuh @@ -0,0 +1,163 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
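A short sketch of how the registry above is used: a RegisteredMemoryVar registers itself on construction, and clear_consistentMemory() resets every registered variable (the variable names are illustrative).

#include "consistent_memory.h"

void registry_example()
{
    ACSpGEMM::RegisteredMemoryVar<size_t> temp_bytes(128);   // registers itself in the global list
    temp_bytes += 64;                                        // now holds 192
    ACSpGEMM::clear_consistentMemory();                      // resets every registered variable to 0
    size_t after = temp_bytes;                               // implicit conversion; after == 0
    (void)after;
}                                                            // destructor unregisters the variable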
+// + +#pragma once +#include "CSR.cuh" +#include +#include +#include +#pragma once + + +template +struct CSR; + +template +struct dCSR +{ + size_t rows, cols, nnz; + + T* data; + unsigned int* row_offsets; + unsigned int* col_ids; + + dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } + void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true); + void reset(); + ~dCSR(); +}; + + + +namespace +{ + template + void dealloc(dCSR& mat) + { + cudaPointerAttributes attr; + //cudaPointerGetAttributes(&attr, mat.col_ids); + //if (attr.type == 2) { + if (mat.col_ids != nullptr) + cudaFree(mat.col_ids); + //} + //cudaPointerGetAttributes(&attr, mat.data); + //if (attr.type == 2) + if (mat.data != nullptr) + cudaFree(mat.data); + //cudaPointerGetAttributes(&attr, mat.row_offsets); + //if (attr.type == 2) + if (mat.row_offsets != nullptr) + cudaFree(mat.row_offsets); + + mat.nnz = 0; + mat.col_ids = nullptr; + mat.data = nullptr; + mat.row_offsets = nullptr; + //if(cudaSuccess != cudaGetLastError()) std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl; + + } +} + +template +void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets) +{ + dealloc(*this); + rows = r; + cols = c; + nnz = n; + cudaMalloc(&data, sizeof(T)*n); + cudaMalloc(&col_ids, sizeof(unsigned int)*n); + if (allocOffsets) + cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1)); +} +template +dCSR::~dCSR() +{ + dealloc(*this); +} + +template +void dCSR::reset() +{ + dealloc(*this); +} + + +template +void convert(dCSR& dst, const CSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice); + cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + if (padding) + { + cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T)); + cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int)); + cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int)); + } +} + +template +void convert(CSR& dst, const dCSR& src) +{ + unsigned int padding= 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +template +void convert(dCSR& dst, const dCSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice); +} + +template +void convert(CSR& dst, const CSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + memcpy(dst.data.get(), 
src.data.get(), dst.nnz * sizeof(T)); + memcpy(dst.col_ids.get(), src.col_ids.get(), dst.nnz * sizeof(unsigned int)); + memcpy(dst.row_offsets.get(), src.row_offsets.get(), (dst.rows + 1) * sizeof(unsigned int)); +} diff --git a/include/GALATIC/include/dCSR.h b/include/GALATIC/include/dCSR.h new file mode 100644 index 00000000..1e9506d3 --- /dev/null +++ b/include/GALATIC/include/dCSR.h @@ -0,0 +1,158 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
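A host/device round-trip sketch for the conversion helpers above (illustrative; CSR<float> is the host-side type from CSR.cuh, which the .get() calls above assume stores its arrays in smart-pointer members).

#include "CSR.cuh"
#include "dCSR.cuh"

void round_trip(const CSR<float>& A_h)   // host matrix, assumed filled elsewhere
{
    dCSR<float> A_d;
    convert(A_d, A_h);                   // host -> device (cudaMemcpyHostToDevice)

    CSR<float> A_back;
    convert(A_back, A_d);                // device -> host (cudaMemcpyDeviceToHost)
}                                        // A_d's destructor frees the device buffers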
+// + +#pragma once +#include "CSR.h" +#include +#include +#include + +namespace GALATIC { +template +struct dCSR +{ + size_t rows, cols, nnz; + + T* data; + unsigned int* row_offsets; + unsigned int* col_ids; + + dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } + void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true); + void reset(); + ~dCSR(); +}; + + + +namespace +{ + template + void dealloc(dCSR& mat) + { + if (mat.col_ids != nullptr) + cudaFree(mat.col_ids); + if (mat.data != nullptr) + cudaFree(mat.data); + if (mat.row_offsets != nullptr) + cudaFree(mat.row_offsets); + mat.col_ids = nullptr; + mat.data = nullptr; + mat.row_offsets = nullptr; + } +} + +template +void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets) +{ + dealloc(*this); + rows = r; + cols = c; + nnz = n; + cudaMalloc(&data, sizeof(T)*n); + cudaMalloc(&col_ids, sizeof(unsigned int)*n); + if (allocOffsets) + cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1)); +} +template +dCSR::~dCSR() +{ + dealloc(*this); +} + +template +void dCSR::reset() +{ + dealloc(*this); +} + + +template +void convert(dCSR& dst, const CSR& src, unsigned int padding) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice); + cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + if (padding) + { + cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T)); + cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int)); + cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int)); + } +} + +template +void convert(CSR& dst, const dCSR& src, unsigned int padding) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +template +void convert(dCSR& dst, const dCSR& src) +{ + unsigned int padding = 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice); +} + +template +void convert(CSR& dst, const CSR& src) +{ + unsigned int padding = 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + memcpy(dst.data.get(), src.data.get(), dst.nnz * sizeof(T)); + memcpy(dst.col_ids.get(), src.col_ids.get(), dst.nnz * sizeof(unsigned int)); + memcpy(dst.row_offsets.get(), src.row_offsets.get(), (dst.rows + 1) * sizeof(unsigned int)); +} + +template +void convert(CSR& dst, const dCSR& src) +{ + unsigned int padding= 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; 
dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; diff --git a/include/GALATIC/include/dVector.h b/include/GALATIC/include/dVector.h new file mode 100644 index 00000000..d4b8be56 --- /dev/null +++ b/include/GALATIC/include/dVector.h @@ -0,0 +1,74 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include "Vector.h" +#include + +template +struct dDenseVector +{ + size_t size; + T* data; + + dDenseVector() : size(0), data(nullptr) { } + void alloc(size_t s) + { + if (data != nullptr) + cudaFree(data); + cudaMalloc(&data, sizeof(T)*s); + size = s; + } + ~dDenseVector() + { + if (data != nullptr) + cudaFree(data); + } +}; + +template +void convert(dDenseVector & dvec, const DenseVector& vec, unsigned int padding = 0) +{ + dvec.alloc(vec.size+padding); + dvec.size = vec.size; + + cudaMemcpy(dvec.data, &vec.data[0], dvec.size * sizeof(T), cudaMemcpyHostToDevice); + if (padding) + cudaMemset(dvec.data + dvec.size, 0, padding * sizeof(T)); +} + +template +void convert(DenseVector & vec, const dDenseVector& dvec) +{ + vec.alloc(dvec.size); + cudaMemcpy(&vec.data[0], dvec.data, dvec.size * sizeof(T), cudaMemcpyDeviceToHost); +} \ No newline at end of file diff --git a/include/GALATIC/include/default_scheduling_traits.h b/include/GALATIC/include/default_scheduling_traits.h new file mode 100644 index 00000000..47ea9946 --- /dev/null +++ b/include/GALATIC/include/default_scheduling_traits.h @@ -0,0 +1,80 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
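The dense-vector helpers above follow the same host/device pattern; a minimal sketch (names illustrative, element type assumed).

#include "Vector.h"
#include "dVector.h"

void vector_round_trip()
{
    DenseVector<float> x_h;
    x_h.alloc(1024);
    for (size_t i = 0; i < x_h.size; ++i)
        x_h.data[i] = 1.0f;

    dDenseVector<float> x_d;
    convert(x_d, x_h);    // host -> device; an optional third argument adds zero padding
    convert(x_h, x_d);    // device -> host
}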
+// + +#pragma once + + +struct GeneralSchedulingTraits +{ + static const bool MultiGPU = false; + + bool preferLoadBalancing; + size_t cpu_threads; + int device; + + GeneralSchedulingTraits() : cpu_threads(8), device(0), preferLoadBalancing(true) { } +}; + +struct AVX2SchedulingTratis : public GeneralSchedulingTraits{}; + +struct DefaultSchedulingTraits : public GeneralSchedulingTraits {}; + +struct GPUMatrixMatrixMultiplyTraits : public GeneralSchedulingTraits +{ + const int Threads; + const int BlocksPerMp; + const int NNZPerThread; + const int InputElementsPerThreads; + const int RetainElementsPerThreads; + const int MaxChunksToMerge; + const int MaxChunksGeneralizedMerge; + const int MergePathOptions; + + + GPUMatrixMatrixMultiplyTraits( + const int Threads = 256, + const int BlocksPerMp = 3, + const int NNZPerThread = 2, + const int InputElementsPerThreads = 4, + const int RetainElementsPerThreads = 4, + const int MaxChunksToMerge = 16, + const int MaxChunksGeneralizedMerge = 256, + const int MergePathOptions = 8) : + Threads(Threads), + BlocksPerMp(BlocksPerMp), + NNZPerThread(NNZPerThread), + InputElementsPerThreads(InputElementsPerThreads), + RetainElementsPerThreads(RetainElementsPerThreads), + MaxChunksToMerge(MaxChunksToMerge), + MaxChunksGeneralizedMerge(MaxChunksGeneralizedMerge), + MergePathOptions(MergePathOptions) + {} +}; diff --git a/include/GALATIC/include/device/ARowStorage.cuh b/include/GALATIC/include/device/ARowStorage.cuh new file mode 100644 index 00000000..d3c6c4f0 --- /dev/null +++ b/include/GALATIC/include/device/ARowStorage.cuh @@ -0,0 +1,173 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
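A construction sketch for the traits type above: every tuning parameter has the default listed in the constructor, so callers either take the defaults or override the leading parameters positionally (the values below are illustrative only).

#include "default_scheduling_traits.h"

void traits_example()
{
    GPUMatrixMatrixMultiplyTraits defaults;        // Threads=256, BlocksPerMp=3, NNZPerThread=2, ...
    GPUMatrixMatrixMultiplyTraits wide(512, 2);    // 512 threads per block, 2 blocks per SM, rest default
    wide.preferLoadBalancing = true;               // fields inherited from GeneralSchedulingTraits
    wide.device = 0;                               // CUDA device the multiplication should run on
    (void)defaults;
}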
+// + +/*!/------------------------------------------------------------------------------ + * ARowStorage.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "../meta_utils.h" + + +template +class ARowStorage; + +template +class ARowStorage +{ + INDEX_TYPE row_ids[NNZ_PER_BLOCK]; + +public: + using EncodedRowType = INDEX_TYPE; + + __device__ __forceinline__ + void clear() + { + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_BLOCK; i += THREADS) + row_ids[i + threadIdx.x] = 0; + } + + __device__ __forceinline__ + void storeReference(uint32_t id, INDEX_TYPE row) + { + } + + __device__ __forceinline__ + void storeRow(uint32_t id, uint32_t ref, INDEX_TYPE row) + { + row_ids[id] = row; + //printf("direct %d stores row: %d %d %d -> %d gets row %d\n", threadIdx.x, id, ref, row, id, row); + } + __device__ __forceinline__ + void storeEncodedRow(uint32_t id, INDEX_TYPE row) + { + row_ids[id] = row; + } + + __device__ __forceinline__ + INDEX_TYPE getEncodedRow(uint32_t id) + { + //printf("direct %d req encoded row: %d (which is -> %d)\n", threadIdx.x, id, row_ids[id]); + return row_ids[id]; + } + + __device__ __forceinline__ + INDEX_TYPE decodeRow(INDEX_TYPE row) + { + //printf("direct %d decodes row: %d -> %d\n", threadIdx.x, row, row); + return row; + } + + __device__ __forceinline__ + static INDEX_TYPE restartRowDecode(uint32_t restart_row, INDEX_TYPE first_row) + { + return first_row + restart_row; + } + __device__ __forceinline__ + static uint32_t restartRowEncode(INDEX_TYPE row, INDEX_TYPE first_row) + { + return row - first_row; + } +}; + +template +class ARowStorage +{ + using ReferenceType = ChooseBitDataType::value>::value>; + INDEX_TYPE row_ids[NNZ_PER_BLOCK]; + ReferenceType references[NNZ_PER_BLOCK]; + + +public: + + using EncodedRowType = uint32_t; + + __device__ __forceinline__ + void clear() + { + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_BLOCK; i += THREADS) + references[i + threadIdx.x] = 0; + } + + + __device__ __forceinline__ + void storeReference(EncodedRowType id, INDEX_TYPE row) + { + row_ids[id] = row; + //printf("%d stores ref: %d %d -> %d gets real row %d\n", threadIdx.x, id, row, id, row); + } + + __device__ __forceinline__ + void storeRow(uint32_t id, EncodedRowType ref, INDEX_TYPE row) + { + references[id] = static_cast(ref); + //printf("%d stores row: %d %d %d -> %d gets ref %d\n", threadIdx.x, id, ref, row, id, ref); + } + + __device__ __forceinline__ + void storeEncodedRow(uint32_t id, EncodedRowType ref) + { + references[id] = static_cast(ref); + } + + __device__ __forceinline__ + EncodedRowType getEncodedRow(uint32_t id) + { + //printf("%d req encoded row: %d (which is %d -> %d)\n", threadIdx.x, id, references[id], row_ids[references[id]]); + return references[id]; + } + + __device__ __forceinline__ + INDEX_TYPE decodeRow(EncodedRowType row) + { + //printf("%d decodes row: %d -> %d\n", threadIdx.x, row, row_ids[row]); + return row_ids[row]; + } + + __device__ __forceinline__ + static INDEX_TYPE restartRowDecode(EncodedRowType restart_row, INDEX_TYPE first_row) + { + return restart_row; + } + __device__ __forceinline__ + static uint32_t restartRowEncode(EncodedRowType row, INDEX_TYPE first_row) + { + return row; + } +}; \ No newline at end of file diff --git a/include/GALATIC/include/device/Chunk.cuh b/include/GALATIC/include/device/Chunk.cuh new file mode 100644 index 00000000..e0064f31 --- /dev/null +++ 
b/include/GALATIC/include/device/Chunk.cuh @@ -0,0 +1,290 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * Chunk.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once +#include "../common.h" + +using ChunkSortType = uint32_t; +const int chunk_member_offset = alignment(sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(ChunkSortType), 8); + + +template +struct alignas(16) Chunk +{ + // with which row does the chunk start + INDEX_TYPE firstrow; + // the number of matrix entries and column offsets in the chunk + uint32_t num_entries; + // where does the last (uncompleted) row in the chunk start + uint32_t last_row_count; + // how many elements are in the first row + uint32_t first_row_count; + // sortkey + ChunkSortType sort_key; + + + + __device__ __forceinline__ Chunk(uint32_t num, INDEX_TYPE firstrow, uint32_t firstrowCount = 0, uint32_t lastrowCount = 0, ChunkSortType sortkey = 0) : + firstrow(firstrow), num_entries(num), last_row_count(lastrowCount), first_row_count(firstrowCount), sort_key(sortkey) + { + + } + + __device__ __forceinline__ static uint32_t size(uint32_t count, bool nextPointers) + { + uint32_t s = (nextPointers ? 
16 : 0) + count*(sizeof(VALUE_TYPE) + sizeof(INDEX_TYPE)) + sizeof(Chunk); + return (s + 15) & 0xFFFFFFF0; + + } + __device__ __forceinline__ + static Chunk* place(void* chunks, uint32_t offset, uint32_t num, INDEX_TYPE firstrow, uint32_t firstrowCount = 0, uint32_t lastrowCount = 0, ChunkSortType sortkey = 0) + { + return new(reinterpret_cast(chunks) + offset) Chunk(num, firstrow, firstrowCount, lastrowCount, sortkey); + } + __device__ __forceinline__ + static Chunk* cast(void* chunks, uint32_t offset) + { + return reinterpret_cast(reinterpret_cast(chunks) + offset); + } + //__device__ __forceinline__ void write(void* location) const + //{ + // *reinterpret_cast(location) = *reinterpret_cast(this); + //} + + __device__ __forceinline__ VALUE_TYPE* values_direct(uint32_t count) + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset); + } + __device__ __forceinline__ INDEX_TYPE* indices_direct(uint32_t count) + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset + sizeof(VALUE_TYPE)*count); + } + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) const + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset); + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) const + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset + sizeof(VALUE_TYPE)*count); + } + + __device__ __forceinline__ void writeNextFront(Chunk* next) + { + *reinterpret_cast(reinterpret_cast(this) - 16) = next; + } + + __device__ __forceinline__ void writeNextBack(Chunk* next) + { + *reinterpret_cast(reinterpret_cast(this) - 8) = next; + } + + __device__ __forceinline__ void writeNextPointer(Chunk* next, bool front) + { + *reinterpret_cast(reinterpret_cast(this) - 16 + (front ? 0 : 8)) = next; + } + + __device__ __forceinline__ Chunk* readNextFront() const + { + return *reinterpret_cast(reinterpret_cast(this) - 16); + } + + __device__ __forceinline__ Chunk* readNextBack() const + { + return *reinterpret_cast(reinterpret_cast(this) - 8); + } + + __device__ __forceinline__ void setLastConsumed() + { + last_row_count = last_row_count | 0x80000000; + } + __device__ __forceinline__ void setFirstConsumed() + { + first_row_count = first_row_count | 0x80000000; + } + + static const uint32_t StartingOffsetFlag = 0x40000000; + + __device__ __forceinline__ uint32_t startingoffset() const + { + if ((first_row_count & StartingOffsetFlag) == StartingOffsetFlag) + return first_row_count & 0x3FFFFFFF; + return 0; + } + + __device__ __forceinline__ bool lastConsumed() const + { + return (last_row_count & 0x80000000) != 0; + } + __device__ __forceinline__ bool firstConsumed() const + { + return (first_row_count & 0x80000000) != 0; + } + + __device__ __forceinline__ uint32_t lastCountCleared() const + { + return last_row_count & (~0x80000000); + } + __device__ __forceinline__ uint32_t firstCountCleared() const + { + return first_row_count & (~0xC0000000); + } + + __device__ __forceinline__ VALUE_TYPE getMultiplier() const + { + return 1; + } + __device__ __forceinline__ bool isDirect() const + { + return last_row_count == 0xFFFFFFFF; + } +}; + +template +__device__ __forceinline__ bool allocChunk(uint32_t count, uint32_t* chunk_alloc, uint32_t chunk_size, uint32_t& offset, int& worstcaseRem, bool nextPointers = true) +{ + uint32_t s = Chunk::size(count, nextPointers); + worstcaseRem -= s; + offset = atomicAdd(chunk_alloc, s) + (nextPointers ? 
16 : 0); + return offset + s <= chunk_size; +} + +template +__device__ __forceinline__ uint32_t completeChunkAlloc(uint32_t count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t chunk_size, void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, uint32_t* chunk_pointer_pos, OUT_OF_MEM_CALLBACK cb, OUT_OF_CHUNK_POINTER_CALLBACK ccb) +{ + //alloc chunk + uint32_t chunkoff; + int unused_worstCaseRemainder; + if (!allocChunk(count, chunk_alloc, chunk_size, chunkoff, unused_worstCaseRemainder)) + { + chunkoff = 0xFFFFFFFF; + cb(); + } + else + { + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = 0xFFFFFFFF; + if (chunk_pointer_position == chunk_pointer_sizes) + *chunk_pointer_pos = chunk_pointer_sizes; + ccb(); + } + else + { + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::cast(chunks, chunkoff)); + } + } + return chunkoff; +} + + + +template +struct alignas(16) DirectChunk : public Chunk +{ + using Chunk::sort_key; + const INDEX_TYPE* indices; + const VALUE_TYPE* values; + LEFT_T multiplier; + + __device__ __forceinline__ DirectChunk(uint32_t num, INDEX_TYPE firstrow, const INDEX_TYPE* indices, const VALUE_TYPE* values, LEFT_T multiplier, ChunkSortType sortkey = 0) : + Chunk(num, firstrow, num, 0xFFFFFFFF, sortkey), + indices(indices), + values(values), + multiplier(multiplier) + { + + } + + __device__ __forceinline__ static uint32_t size(bool nextPointers) + { + uint32_t s = (nextPointers ? 16 : 0) + sizeof(DirectChunk); + return (s + 15) & 0xFFFFFFF0; + } + + __device__ __forceinline__ + static DirectChunk* place(void* chunks, uint32_t offset, uint32_t num, INDEX_TYPE firstrow, const INDEX_TYPE* indices, const VALUE_TYPE* values, LEFT_T multiplier, ChunkSortType sortkey = 0) + { + return new(reinterpret_cast(chunks) + offset) DirectChunk(num, firstrow, indices, values, multiplier, sortkey); + } + __device__ __forceinline__ + static DirectChunk* cast(void* chunks, uint32_t offset) + { + return reinterpret_cast(reinterpret_cast(chunks) + offset); + } + //__device__ __forceinline__ void write(void* location) const + //{ + // *reinterpret_cast(location)[0] = *reinterpret_cast(this)[0]; + // *reinterpret_cast(location)[1] = *reinterpret_cast(this)[1]; + //} + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) + { + return values; + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) + { + return indices; + } + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) const + { + return values; + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) const + { + return indices; + } + + __device__ __forceinline__ LEFT_T getMultiplier() const + { + return multiplier; + } +}; + +template +__device__ __forceinline__ bool allocDirectChunk(uint32_t* chunk_alloc, uint32_t chunk_size, uint32_t& offset, bool nextPointers = true) +{ + uint32_t s = DirectChunk::size(nextPointers); + offset = atomicAdd(chunk_alloc, s) + (nextPointers ? 
16 : 0); + return offset + s <= chunk_size; +} diff --git a/include/GALATIC/include/device/HelperFunctions.cuh b/include/GALATIC/include/device/HelperFunctions.cuh new file mode 100644 index 00000000..6df121ee --- /dev/null +++ b/include/GALATIC/include/device/HelperFunctions.cuh @@ -0,0 +1,877 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
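Both Chunk::size and DirectChunk::size above round their byte counts up to a 16-byte boundary via (s + 15) & 0xFFFFFFF0; two worked values for reference:

//   s = 25  ->  (25 + 15) & ~0xF  =  40 & ~0xF  =  32
//   s = 48  ->  (48 + 15) & ~0xF  =  63 & ~0xF  =  48   (already a multiple of 16)
static_assert(((25u + 15u) & 0xFFFFFFF0u) == 32u, "rounded up to the next multiple of 16");
static_assert(((48u + 15u) & 0xFFFFFFF0u) == 48u, "multiples of 16 are unchanged");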
+// + +/*!/------------------------------------------------------------------------------ + * HelperFunctions.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include +#include "../meta_utils.h" +#include "../devicetools/event.h" +#include "../MergeCaseOffsets.h" + +namespace +{ + template < + typename IndexType, + typename ConversionOp, + typename OffsetT = ptrdiff_t> + class CustomGeneratorIterator + { + public: + + // Required iterator traits + typedef CustomGeneratorIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef typename ConversionOp::value_type value_type; ///< The type of the element the iterator can point to + //typedef value_type* pointer; ///< pointer not supported + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + private: + + ConversionOp conversion_op; + IndexType id; + + public: + + /// Constructor + __host__ __device__ __forceinline__ CustomGeneratorIterator( + ConversionOp conversion_op, ///< Conversion functor to wrap + IndexType base_id = 0) ///< Input id to start at + : + conversion_op(conversion_op), + id(base_id) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ++id; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ++id; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(id); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(conversion_op, id + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + id += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(conversion_op, id - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + id -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return id - other.id; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(id + n); + } + + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (id == rhs.id); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (id != rhs.id); + } + }; + + + template < + typename ValueType, + typename Consume, + typename OffsetT = 
ptrdiff_t> + class CustomOutputConsumerIterator + { + private: + + // Proxy object + struct Reference + { + ValueType* ptr; + Consume consume; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr, Consume consume) : ptr(ptr), consume(consume) {} + + /// Assignment + __device__ __forceinline__ ValueType operator = (ValueType val) + { + consume(ptr, val); + return val; + } + }; + + public: + + // Required iterator traits + typedef CustomOutputConsumerIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + private: + + Consume consume; + pointer ptr; + + + public: + + /// Constructor + __host__ __device__ __forceinline__ CustomOutputConsumerIterator( + Consume consume, + pointer ptr = nullptr) ///< Native pointer to wrap + : + consume(consume), + ptr(ptr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr, consume); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(consume, ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(consume, ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return Reference(ptr + n, consume); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + }; + + + template + class CaseCombinerConverter + { + const INDEX_TYPE* const maxPerRowElements; + const uint32_t* const sharedRows; + const uint32_t* const chunkCounter; + public: + + typedef OUT_TYPE value_type; + + __host__ __device__ __forceinline__ + CaseCombinerConverter(const uint32_t* sharedRows, const 
INDEX_TYPE* maxPerRowElements, const uint32_t* chunkCounter) : + maxPerRowElements(maxPerRowElements), + sharedRows(sharedRows), + chunkCounter(chunkCounter) + { } + + __host__ __device__ __forceinline__ + OUT_TYPE operator()(const uint32_t &id) const + { + uint32_t row = sharedRows[id]; + uint32_t chunks = chunkCounter[row]; + //INDEX_TYPE elementCounter = maxPerRowElements[row]; + int type = 2; + if (chunks == 2 && maxPerRowElements[row] < MergeMaxElements) + type = 0; + else if ((chunks & (~MAX_CHUNKS_CASE)) < MaxMergeChunks && (chunks & CASE_DISTINCTION) == 0 /*&& false*/) + type = 1; + + if (type < OFFSET || type >= OFFSET + NUM) + return 0; + OUT_TYPE res = ((OUT_TYPE(1) << (BITS-1)) | OUT_TYPE(1)) << (BITS*(type - OFFSET)); + /*if (blockIdx.x == 0) + printf("%d (%d) convert %d (%d) to %llx\n", id, sharedRows[id], elementCounter, type, (uint64_t)res);*/ + return res; + } + }; + + template + class CaseSeparatorConsumer + { + INDEX_TYPE* const outputPointers; + INDEX_TYPE* const counters; + INDEX_TYPE* const row_counts; + const uint32_t* const sharedRows; + const uint32_t activeRows; + public: + __host__ __device__ __forceinline__ + CaseSeparatorConsumer(const uint32_t* sharedRows, INDEX_TYPE* outputPointers, INDEX_TYPE* counters, uint32_t activeRows, INDEX_TYPE* row_counts) : + outputPointers(outputPointers), + counters(counters), + sharedRows(sharedRows), + row_counts{ row_counts }, + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + void operator()(IN_TYPE* virtualOffset, const IN_TYPE sumresult) const + { + int type = -1; + INDEX_TYPE offset = 0; + const IN_TYPE mask = IN_TYPE(1) << (BITS - 1); + const IN_TYPE select = mask - 1; + + IN_TYPE* virtualBase = nullptr; + uint32_t dist = virtualOffset - virtualBase; + if (dist == activeRows - 1) + { + // final writes counts + #pragma unroll + for (int i = 0; i < NUM; ++i) + { + if (i + OFFSET < 3) + { + INDEX_TYPE sum = static_cast((sumresult >> (i*BITS)) & select); + counters[OFFSET + i] = sum; + } + } + counters[3] = 0; + outputPointers[(activeRows) * 3] = 0; + } + + #pragma unroll + for (int i = 0; i < NUM; ++i) + { + if ((sumresult & (mask << (i*BITS))) != 0) + { + type = i; + offset = static_cast((sumresult >> (i*BITS)) & select); + } + } + if (type == -1) + { + //if(blockIdx.x == 0) + // printf("%d %d: %llx would not write\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult); + return; + } + + type += OFFSET; + //if (blockIdx.x == 0) + // printf("%d %d: %llx would write %d to %d(%d) (%llx, %llx, %d + %d)\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult, offset, dist, type, outputPointers, counters, activeRows, activeRows*type + dist + 1); + //printf("%d %d: %llx writinting %d (%d) to %d (%d*%d + %d -1)\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult, sharedRows[dist], dist, + // (activeRows)*type + offset - 1, activeRows, type,offset); + /*if (type == 0 && row_counts[sharedRows[dist]] > 1024) + printf("RowCount at position %u is : %u\n", sharedRows[dist], row_counts[sharedRows[dist]]);*/ + outputPointers[(activeRows)*type + offset-1] = sharedRows[dist]; + } + }; + + template + struct CombinedAdd + { + /// Boolean max operator, returns (a > b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & MASK) + b; + } + }; + + + struct BlockOffsetRange + { + uint32_t begin, end; + uint32_t count; + }; + + template + class BlockOffsetCreator + { + const INDEX_TYPE* const maxPerRowElements; + const INDEX_TYPE* const sharedRows; + public: + + typedef BlockOffsetRange value_type; + + __host__ __device__ __forceinline__ + BlockOffsetCreator(const INDEX_TYPE* sharedRows, const INDEX_TYPE* maxPerRowElements) : + maxPerRowElements(maxPerRowElements), + sharedRows(sharedRows) + { } + + __host__ __device__ __forceinline__ + BlockOffsetRange operator()(const uint32_t &id) const + { + /*if(maxPerRowElements[sharedRows[id]] > 1024)*/ + //printf("%d creating range: (row %d) %d-%d with %d\n", id, sharedRows[id], id, id + 1, maxPerRowElements[sharedRows[id]]); + return BlockOffsetRange{ id, id + 1, maxPerRowElements[sharedRows[id]] }; + } + }; + + template + struct BlockOffsetCombiner + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + //we have to consider the maximum number elements the block can hold (Max Comb) + //and we have to consider the maximum number shared rows the block can handle (SimpleMergeThreads - 1) -> < SimpleMergeThreads + if (a.end == b.begin && a.count + b.count < MaxComb && b.end - a.begin < SimpleMergeThreads) + { + //printf("merging: %d<>%d %d<>%d %d + %d < %d\n", a.begin, a.end, b.begin, b.end, a.count, b.count, MaxComb); + return BlockOffsetRange{ a.begin, b.end, a.count + b.count}; + } + else + { + //printf("not merging: %d<>%d %d<>%d %d + %d < %d\n", a.begin, a.end, b.begin, b.end, a.count, b.count, MaxComb); + } + //if (b.count >= 1024) + // printf("B.count: %u is too large\n", b.count); + return b; + } + }; + + class BlockOffsetExtractor + { + uint2* const rangeOut; + public: + __host__ __device__ __forceinline__ + BlockOffsetExtractor(uint2* rangeOut) : + rangeOut(rangeOut) + { } + + __host__ __device__ __forceinline__ + void operator()(BlockOffsetRange* virtualOffset, const BlockOffsetRange result) const + { + BlockOffsetRange* virtualBase = nullptr; + uint32_t dist = virtualOffset - virtualBase; + /*if(result.count > 1024)*/ + //printf("%d writing range (%d<>%d) %d | %u\n", dist, result.begin, result.end, result.count, result.test); + rangeOut[dist] = uint2{ result.begin, result.end }; + } + }; + + + class RangeStartTranslator + { + const uint2* __restrict__ ranges; + const uint32_t activeRows; + public: + + typedef uint32_t value_type; + + __host__ __device__ __forceinline__ + RangeStartTranslator(const uint2* ranges, uint32_t activeRows) : + ranges(ranges), + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + uint32_t operator()(const uint32_t &id) const + { + uint32_t res = 0x80000001; + if (id < activeRows - 1) + { + if (ranges[id].x == ranges[id + 1].x) + res = 0; + } + + //if(res != 0) + // printf("%d is a block end (%d<>%d)\n", id, ranges[id].x, ranges[id].y); + + return res; + } + }; + + template + class BlockStartWriter + { + INDEX_TYPE* const blockOffsets; + INDEX_TYPE* counter; + const uint32_t activeRows; + public: + __host__ __device__ __forceinline__ + BlockStartWriter(INDEX_TYPE* blockOffsets, INDEX_TYPE* counter, uint32_t activeRows) : + blockOffsets(blockOffsets), + counter(counter), + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + void operator()(uint32_t* virtualOffset, const uint32_t result) const + { + uint32_t* z = nullptr; + INDEX_TYPE d = 
virtualOffset - z; + if (d == activeRows - 1) + { + *counter = (result & (~0x80000000)); + } + if (result & 0x80000000) + { + blockOffsets[result & (~0x80000000)] = d+1; + //printf("writing block offset %d : %d\n", result & (~0x80000000), d+1); + } + } + }; + + struct PinnedHostMemDeleter + { + void operator()(void* p) const noexcept + { + cudaFreeHost(p); + } + }; + + template + inline auto allocHostMemory(size_t elements = 1) + { + void* p; + cudaMallocHost(&p, elements * sizeof(T)); + return std::unique_ptr(static_cast(p)); + } +} + +namespace std +{ + template + struct iterator_traits> + { + typedef typename CustomGeneratorIterator::value_type value_type; + }; +} + +template +size_t AcSpGEMMKernels::tempMemSize(size_t CRows) +{ + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + size_t temp_storage_bytes2 = 0; + INDEX_TYPE *in = nullptr, *out = nullptr; + uint64_t *in64 = nullptr, *out64 = nullptr; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out, CRows + 1); + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes2, in64, out64, CRows + 1); + temp_storage_bytes = std::max(temp_storage_bytes, temp_storage_bytes2); + + CustomGeneratorIterator> initr(BlockOffsetCreator(nullptr, nullptr)); + CustomOutputConsumerIterator outitr(BlockOffsetExtractor(nullptr)); + cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes2, initr, outitr, BlockOffsetCombiner<1024, 1024>(), CRows); + temp_storage_bytes = std::max(temp_storage_bytes, temp_storage_bytes2); + + CustomGeneratorIterator initr2(RangeStartTranslator(nullptr, 0)); + CustomOutputConsumerIterator> outitr2(BlockStartWriter(nullptr, nullptr, 0)); + cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes2, initr2, outitr2, CombinedAdd(), CRows); + + return std::max(temp_storage_bytes, temp_storage_bytes2) + (CRows * sizeof(uint2) + 15) / 16 * 16 + 16; +} + + template + MergeCaseOffsets AcSpGEMMKernels::assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream, CUstream overlapStream) + { + //const INDEX_TYPE MaxBlockLoad = DoubleMaxBlockLoad; + size_t outtempsize = (activeRows * sizeof(uint2) + 15) / 16 * 16; + size_t adjtempsize = tempMemSize - outtempsize; + void* temporaryMem = reinterpret_cast(reinterpret_cast(tempMem) + outtempsize); + + static auto blockCounter = allocHostMemory(4); + + //sum and offsets of merges + if (activeRows < ((1u<<9)-1)) + { + // use 8 bits + CaseCombinerConverter inConv(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr(inConv); + + CaseSeparatorConsumer outConv(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr(outConv); + + CombinedAdd comb; + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, activeRows, stream); + } + else if (activeRows < ((1u << 20) - 1)) + { + // use 16 bit + CaseCombinerConverter inConv(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr(inConv); + + CaseSeparatorConsumer outConv(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr(outConv); + + CombinedAdd comb; + 
cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, activeRows, stream); + } + else + { + // use 32 bit triple call + CombinedAdd comb; + + CaseCombinerConverter inConv0(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr0(inConv0); + + CaseSeparatorConsumer outConv0(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr0(outConv0); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr0, outitr0, comb, activeRows, stream); + + CaseCombinerConverter inConv1(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr1(inConv1); + + CaseSeparatorConsumer outConv1(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr1(outConv1); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr1, outitr1, comb, activeRows, stream); + + CaseCombinerConverter inConv2(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr2(inConv2); + + CaseSeparatorConsumer outConv2(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr2(outConv2); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr2, outitr2, comb, activeRows, stream); + } + + cudaMemcpy(blockCounter.get(), reinterpret_cast(num_merge_blocks), 3 * sizeof(INDEX_TYPE), cudaMemcpyDeviceToHost); + uint32_t combSharedRows = blockCounter.get()[0]; + + { + BlockOffsetCreator rangeCreator(reinterpret_cast(per_block_offsets), reinterpret_cast(maxPerRowElements)); + CustomGeneratorIterator> initr(rangeCreator); + + BlockOffsetCombiner comb; + + BlockOffsetExtractor rangeExtractor(reinterpret_cast(tempMem)); + CustomOutputConsumerIterator outitr(rangeExtractor); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, combSharedRows, stream); + } + + { + RangeStartTranslator rangeCreator(reinterpret_cast(tempMem), combSharedRows); + CustomGeneratorIterator initr(rangeCreator); + + CombinedAdd comb; + + BlockStartWriter rangeExtractor(reinterpret_cast(per_block_offsets) + 3 * activeRows, reinterpret_cast(num_merge_blocks) + 3, combSharedRows); + CustomOutputConsumerIterator> outitr(rangeExtractor); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, combSharedRows, stream); + } + + cudaMemcpy(blockCounter.get(), reinterpret_cast(num_merge_blocks), 4 * sizeof(INDEX_TYPE), cudaMemcpyDeviceToHost); + + return MergeCaseOffsets(blockCounter.get()[3], blockCounter.get()[1], blockCounter.get()[2], blockCounter.get()[0]); + } + + template + void AcSpGEMMKernels::computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream) + { + INDEX_TYPE* workmem = reinterpret_cast(inout); + cub::DeviceScan::ExclusiveSum(tempMem, tempMemSize, workmem, workmem, Crows + 1, stream); + } + +__forceinline__ __device__ unsigned laneid() +{ + unsigned ret; + asm volatile ("mov.u32 %0, %%laneid;" : "=r"(ret)); + return ret; +} + +template +__device__ __forceinline__ void updateMinValue(T &sv, T(&values)[N], int num = N) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T v = sv; + #pragma unroll + for (int i = 0; i < N; ++i) + if (i < num) + v = min(v, 
values[i]); + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Min()); + if (laneid() == 0) + atomicMin(&sv, res); +} + +template +__device__ __forceinline__ void updateMinValue(T &sv, T v) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Min()); + if (laneid() == 0) + atomicMin(&sv, res); +} + +template +__device__ __forceinline__ void updateMaxValue(T &sv, T v) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Max()); + if (laneid() == 0) + atomicMax(&sv, res); +} + +template +__device__ __forceinline__ void updateMaxValue(T &sv, T(&values)[N], int num = N) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T v = sv; + #pragma unroll + for (int i = 0; i < N; ++i) + if (i < num) + v = max(v, values[i]); + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Max()); + if (laneid() == 0) + atomicMax(&sv, res); +} + +template +struct count_clz +{ + static const uint32_t value = (X & 0x80000000) ? Completed : static_clz< (X << 1), Completed + 1 >::value; +}; +template +struct count_clz +{ + static const uint32_t value = 32; +}; + +template +struct ChooseBitDataTypeRounded; +template<> +struct ChooseBitDataTypeRounded<8> +{ + using type = uint8_t; +}; +template<> +struct ChooseBitDataTypeRounded<16> +{ + using type = uint16_t; +}; +template<> +struct ChooseBitDataTypeRounded<32> +{ + using type = uint32_t; +}; +template<> +struct ChooseBitDataTypeRounded<64> +{ + using type = uint64_t; +}; + +template +struct ChooseBitDataTypeRounding +{ + using type = typename ChooseBitDataTypeRounded::type; +}; + +template +using ChooseBitDataType = typename ChooseBitDataTypeRounding::type; + + +template +__device__ __forceinline__ +void addPotentiallySharedRow(uint32_t row, Chunk * chunk, bool first_row, + void** output_row_list_heads, uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, bool force_addlist = false) +{ + unsigned long long* rlh = reinterpret_cast(output_row_list_heads); + unsigned long long c = reinterpret_cast(chunk) | (first_row?2:0); + uint64_t next = atomicExch(rlh + row, c); + bool addlist = false; + if (next == 0) + { + if(force_addlist) + addlist = true; + else + { + //we are first, so mark that next needs to add to list + uint64_t set = atomicCAS(rlh + row, c, (c | 0x1)); + if (set != c) + //someone else added to the list before we could mark for setting shared list, so we have to do it + addlist = true; + } + } + else if ((next & 0x1) != 0) + { + addlist = true; + next = next & 0xFFFFFFFFFFFFFFFEULL; + } + + chunk->writeNextPointer(reinterpret_cast*>(next), first_row); + + uint32_t p = static_cast(-1); + if (addlist) + { + p = atomicAdd(shared_rows_alloc, 1); + shared_rows_tracker[p] = row; + } +} + +// ######################################################################## +// Explicit instantiations +// ######################################################################## +template size_t AcSpGEMMKernels::tempMemSize(size_t CRows); +template void AcSpGEMMKernels::computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream); +#define GPUCompressedMatrixMatrixMultiplyHelper(THREADS, TEMPPERTHREAD, MERGEMAXCHUNKS) \ +template MergeCaseOffsets AcSpGEMMKernels::assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream, CUstream 
overlapStream); + diff --git a/include/GALATIC/include/device/MultiplyKernels.h b/include/GALATIC/include/device/MultiplyKernels.h new file mode 100644 index 00000000..8bf27fb3 --- /dev/null +++ b/include/GALATIC/include/device/MultiplyKernels.h @@ -0,0 +1,221 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
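The iterator pair defined in HelperFunctions.cuh above (CustomGeneratorIterator and CustomOutputConsumerIterator) lets cub::DeviceScan run without materializing its input or output arrays: the generator synthesizes each input value from its index via a ConversionOp, and the consumer hands each scanned value, together with its virtual position, to a Consume functor. The sketch below shows that pattern in isolation; the functor names (SquareOfIndex, ScatterToArray), the wrapper function, and the include path are illustrative assumptions, not part of this patch.

```cpp
// Sketch only (compile as a .cu file): mirrors how tempMemSize/assignCombineBlocks
// drive cub::DeviceScan through the custom iterators of HelperFunctions.cuh.
#include <cub/cub.cuh>
#include "GALATIC/include/device/HelperFunctions.cuh"   // assumed include path

struct SquareOfIndex                 // ConversionOp: must expose value_type and operator()(id)
{
    using value_type = int;
    __host__ __device__ __forceinline__ int operator()(const int& id) const { return id * id; }
};

struct ScatterToArray                // Consume: receives (virtual position, scanned value)
{
    int* out;
    __device__ __forceinline__ void operator()(int* virtualPos, int value) const
    {
        int* base = nullptr;              // the iterator is constructed with ptr == nullptr,
        out[virtualPos - base] = value;   // so the pointer difference is the element index
    }
};

// Inclusive prefix sum of 0^2, 1^2, ..., (n-1)^2, written into d_out (device memory).
void prefixSumOfSquares(int* d_out, int n, cudaStream_t stream = 0)
{
    CustomGeneratorIterator<int, SquareOfIndex>       in(SquareOfIndex{});
    CustomOutputConsumerIterator<int, ScatterToArray> out(ScatterToArray{ d_out });

    void*  d_temp     = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, in, out, cub::Sum(), n, stream); // size query
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, in, out, cub::Sum(), n, stream);
    cudaFree(d_temp);
}
```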
+// + +/*!/------------------------------------------------------------------------------ + * MultiplyKernels.h + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "../SemiRingInterface.h" +#include +#include +#include +#include "../MergeCaseOffsets.h" + +const int RESTART_OFF = 0; +const int RESTART_WRONG_CASE = 1; +const int RESTART_FIRST_ITERATION = 2; +const int RESTART_ITERATION_FINISH = 3; +const int RESTART_ITERATION_UNKNOWN = 4; +const int helper_overhead = 4; +#define WARP_SIZE 32 +#define MAX_CHUNKS_CASE 0x80000000 +#define GENERALIZED_CASE 0xC0000000 +#define CASE_DISTINCTION 0x40000000 // MAX_CHUNKS_CASE - GENERALIZED_CASE + +// Debugging +#define ROW_TO_INVESTIGATE 2579 + +#define ENABLE_SORTING + + + +//################################################### +// Tagged unions / "enums" + +template +struct Either { + union Data { + T tee; + U you; + }; + + Data data; + unsigned char tag; + + __device__ __host__ bool isFirst() const { + return tag == 0; + } + + __device__ __host__ bool isSecond() const { + return tag == 1; + } + + __device__ __host__ const T& valFirst() const { + return data.tee; + } + __device__ __host__ const U& valSecond() const { + return data.you; + } + + static __device__ __host__ Either First(T te) { + Either result; + result.data.tee = te; + result.tag = 0; + return result; + } + static __device__ __host__ Either Second(U u) { + Either result; + result.data.you = u; + result.tag = 1; + return result; + } + __device__ __host__ Either () {} +}; + + + + + +class AcSpGEMMKernels +{ +public: + AcSpGEMMKernels(uint32_t blockDim=128): + blockDim{blockDim} + {} + + void setLaunchDimensions(uint32_t _gridDim, cudaStream_t _stream = 0, uint32_t _blockDim = 128) + { + gridDim = _gridDim; + blockDim = _blockDim; + stream = _stream; + } + + // ##################################################################### + // Determine Block Starts + // + template + void h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, + uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8); + + // ##################################################################### + // SpGEMM stage + // + template + void h_computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + /*fixme const T2 -> */const typename SEMIRING_t::rightInput_t* __restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + // ##################################################################### + // Merge Chunks Simple + // + template + void h_mergeSharedRowsSimple(const 
uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Merge Chunks Max Chunks + // + template + void h_mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Merge Chunks Generalized + // + template + void h_mergeSharedRowsGeneralized(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Copy Chunks into CSR format + // + template< typename VALUE_TYPE, typename INDEX_TYPE, typename OFFSET_TYPE> + void h_copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, + VALUE_TYPE * value_out, INDEX_TYPE * index_out, const uint32_t* __restrict result_offets); + + // ##################################################################### + // Calculate temporary memory size + // + template + size_t tempMemSize(size_t CRows); + + // ##################################################################### + // Merge Case assignment + // + template + MergeCaseOffsets assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream = 0, CUstream overlapStream = 0); + + // ##################################################################### + // Compute CSR offsets + // + template + void computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream = 0); + + +private: + uint32_t blockDim; + uint32_t gridDim; + cudaStream_t stream; +}; + diff --git a/include/GALATIC/include/device/SortAndCombine.cuh b/include/GALATIC/include/device/SortAndCombine.cuh new file mode 100644 index 00000000..13044dad --- /dev/null +++ b/include/GALATIC/include/device/SortAndCombine.cuh @@ -0,0 +1,209 @@ +// Project 
AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * SortAndCombine.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once +#include +#include + +template +class SortAndCombine +{ + template + class CombinerOp + { + SameElement sameElement; + SameRow sameRow; + SEMIRING_t semiring; + public: + __device__ __forceinline__ CombinerOp(SameElement sameElement, SameRow sameRow, SEMIRING_t semiring) : + sameElement(sameElement), + sameRow(sameRow), + semiring(semiring) + { } + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + //T comb; + //uint32_t ca = a.key.get(); + + //////0x1 means we have to add over + ////if (ca & 0x1) + //// comb.value = b.value; + ////else + //// comb.value = a.value + b.value; + + ////same as above, just without conditional + + //float amul = 1.0f - __int_as_float((ca & 0x1) * __float_as_int(1.0f)); + //comb.value = a.value * amul + b.value; + + //// in case we are at the end of the combine elements, we want to increase both by one + //uint32_t modca = ca + ((ca & 0x1) * 0x00020002); + //// we need to add the parts that are outside of the mask + //uint32_t amask = ((ca & 0x10000) * 0xFFFE) ^ 0xFFFEFFFE; + //// in case a new row starts, we need to reset the front part + //uint32_t res = (modca & amask) + b.key.get(); + + //comb.key = decltype(comb.key)(res); + //return comb; + + + uint32_t newastate = (!sameRow(a.index, b.index)) ? (a.getState() & 0xFFFE) : (a.getState() & 0xFFFEFFFE); + //decltype(a.value) amul = sameElement(a.index, b.index) ? SEMIRING_t::MultiplicativeIdentity() : SEMIRING_t::AdditiveIdentity() ; + return T(b.index, semiring.add( sameElement(a.index, b.index) ? 
a.value : (SEMIRING_t::AdditiveIdentity()) , b.value), newastate + b.getState()); + + } + }; +public: + class CombResult + { + uint32_t state; + public: + SORTINDEX_TYPE index; + VALUE_TYPE value; + + __device__ __forceinline__ CombResult() = default; + + __device__ __forceinline__ CombResult(SORTINDEX_TYPE index, VALUE_TYPE value, uint32_t state = 0) : + index(index), value(value), state(state) + { } + + __device__ __forceinline__ CombResult(SORTINDEX_TYPE index, VALUE_TYPE value, bool endElement, bool endRow) : + index(index), value(value), state((endRow ? 0x10000 : 0) | (endElement ? 0x20003 : 0)) + { } + + __device__ __forceinline__ uint32_t getState() const + { + return state; + } + __device__ __forceinline__ uint32_t memoffset() const + { + return ((state >> 1) & 0x7FFF) -1; + } + __device__ __forceinline__ uint32_t rowcount() const + { + return state >> 17; + } + __device__ __forceinline__ bool isResult() const + { + return (state & 0x1) != 0; + } + __device__ __forceinline__ bool isRowend() const + { + return ((state >> 16) & 0x1) != 0; + } + }; + + + using CUBCombIndexValueSort = cub::BlockRadixSort; + using ScanCombinerEntry = CombResult; + using CUBScanCombiner = cub::BlockScan; + + union SMem + { + typename CUBCombIndexValueSort::TempStorage combIndexValueSortTempMem; + typename CUBScanCombiner::TempStorage combinerScanTempMem; + SORTINDEX_TYPE threadFirstElementIdentifier[THREADS + 1]; + }; + + template + __device__ __forceinline__ + static uint32_t combine(SMem& smem, + SORTINDEX_TYPE (&combIndex)[ELEMENTS_PER_THREAD], typename SEMIRING_t::output_t (&data)[ELEMENTS_PER_THREAD], ScanCombinerEntry(&combinedEntries)[ELEMENTS_PER_THREAD], + SameElement sameElement, SameRow sameRow, SEMIRING_t semiring, uint32_t sortbits = sizeof(SORTINDEX_TYPE)*8) + { + + //sort according to RowA/ColumnB (together with shared content) + CUBCombIndexValueSort(smem.combIndexValueSortTempMem).Sort(combIndex, data, 0, sortbits); + __syncthreads(); + + + //figure out who has the last element to be combined + smem.threadFirstElementIdentifier[THREADS] = static_cast(-1); + smem.threadFirstElementIdentifier[threadIdx.x] = combIndex[0]; + __syncthreads(); + + + SORTINDEX_TYPE c = combIndex[ELEMENTS_PER_THREAD - 1]; + SORTINDEX_TYPE oc = smem.threadFirstElementIdentifier[threadIdx.x + 1]; + + combinedEntries[ELEMENTS_PER_THREAD - 1] = CombResult(combIndex[ELEMENTS_PER_THREAD - 1], data[ELEMENTS_PER_THREAD - 1], !sameElement(c, oc), !sameRow(c, oc)); + + + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD - 1; ++i) + { + SORTINDEX_TYPE c = combIndex[i]; + SORTINDEX_TYPE oc = combIndex[i + 1]; + + combinedEntries[i] = CombResult(combIndex[i], data[i], !sameElement(c, oc), !sameRow(c, oc)); + } + + __syncthreads(); + + + //segmented prefix sum to add up / get mem offset for new data + ScanCombinerEntry accumulate; + CUBScanCombiner(smem.combinerScanTempMem).InclusiveScan(combinedEntries, combinedEntries, CombinerOp(sameElement, sameRow,semiring), accumulate); + //uint32_t outputData = tempData + min(TEMP_PER_THREAD * THREADS, TEMP_PER_THREAD * THREADS + RowelementWorkDistribution::workAvailable(smem.workdistributionMem)); + uint32_t count = accumulate.memoffset() + 1; + + + return count; + } +}; + +template +struct PathMergerOp +{ + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + const T Mask = (1 << PerChunkBits) - 1; + T res = 0; + #pragma unroll + for (uint32_t i = 0; i < MaxChunks; ++i) + { + T tmask = Mask << static_cast(i*PerChunkBits); + res = res | (max(a 
& tmask, b & tmask)); + } + return res; + } +}; diff --git a/include/GALATIC/include/device/WorkDistribution.cuh b/include/GALATIC/include/device/WorkDistribution.cuh new file mode 100644 index 00000000..9ca72f7a --- /dev/null +++ b/include/GALATIC/include/device/WorkDistribution.cuh @@ -0,0 +1,328 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
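The packed state word of SortAndCombine::CombResult above folds all of the scan bookkeeping into a single uint32_t; the constants 0x20003 (endElement) and 0x10000 (endRow) follow directly from the accessors isResult(), memoffset(), isRowend() and rowcount(). The small host-side decoder below only restates that field layout for reference; it is a sketch, not part of the patch.

```cpp
#include <cstdint>

// Field layout of CombResult::state, read off its accessors (sketch, not in the patch):
//   bit  0       -> isResult()
//   bits 1..15   -> memoffset() returns this field minus 1
//   bit  16      -> isRowend()
//   bits 17..31  -> rowcount()
struct DecodedCombState
{
    bool     isResult;
    uint32_t countField;   // memoffset() + 1
    bool     isRowEnd;
    uint32_t rowField;     // rowcount()
};

inline DecodedCombState decodeCombState(uint32_t state)
{
    return DecodedCombState{
        (state & 0x1u) != 0,
        (state >> 1) & 0x7FFFu,
        ((state >> 16) & 0x1u) != 0,
        state >> 17
    };
}
```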
+// + +/*!/------------------------------------------------------------------------------ + * ChunkstoCSR.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include + + +template +class WorkDistribution +{ +public: + typedef cub::BlockScan SimpleScanT; + + struct SharedMemT + { + int work_sum[THREADS*ELEMENTS_PER_THREAD_IN + 1]; + }; + + using SharedTempMemT = typename SimpleScanT::TempStorage; + + template + struct SharedTempMemOutT + { + int work_offsets[THREADS*MAX_ELEMENTS_PER_THREAD_OUT]; + }; + + + template + __device__ __forceinline__ + static void initialize(SharedMemT& smem, SharedTempMemT& sum_space, int (&thread_work_count)[ELEMENTS_PER_THREAD_IN]) + { + int* work_sum = smem.work_sum; + + if (!BLOCKIN && ELEMENTS_PER_THREAD_IN > 1) + { + //change from interleaved to blocked + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + work_sum[threadIdx.x + i * THREADS + 1] = thread_work_count[i]; + __syncthreads(); + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + thread_work_count[i] = work_sum[threadIdx.x * ELEMENTS_PER_THREAD_IN + i + 1]; + } + SimpleScanT(sum_space).InclusiveSum(thread_work_count, thread_work_count); + #pragma unroll + for(int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x * ELEMENTS_PER_THREAD_IN + i + 1] = thread_work_count[i]; + } + work_sum[0] = 0; + __syncthreads(); + } + + template + __device__ __forceinline__ + static int assignWorkAllThreads(SharedMemT& smem, SharedTempMemT& sum_space, SharedTempMemOutT& tempmem, + int (&work_element_out)[MAX_ELEMENTS_PER_THREAD_OUT], int(&within_element_id)[MAX_ELEMENTS_PER_THREAD_OUT], + int num_distribute = MAX_ELEMENTS_PER_THREAD_OUT*THREADS) + { + int* work_sum = smem.work_sum; + int* work_offsets = tempmem.work_offsets; + + // clear work offsets + #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[i*THREADS + threadIdx.x] = 0; + + __syncthreads(); + + // compute which thread should start with a given work element + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + int v = work_sum[i*THREADS + threadIdx.x]; + int vn = work_sum[i*THREADS + threadIdx.x + 1]; + if (v < MAX_ELEMENTS_PER_THREAD_OUT*THREADS && v != vn) + work_offsets[v] = i*THREADS + threadIdx.x; + } + + __syncthreads(); + + //compute max per thread elements + num_distribute = min(num_distribute, work_sum[THREADS*ELEMENTS_PER_THREAD_IN]); + + // read my offset (can be the right offset or zero as only the first one will have the right per input element) + #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + { + //if (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i < num_distribute) + work_element_out[i] = work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i]; + //else + //work_element_out[i] = 0; + } + + + SimpleScanT(sum_space).InclusiveScan(work_element_out, work_element_out, cub::Max()); + + int outElements = MAX_ELEMENTS_PER_THREAD_OUT; + if (!BLOCKOUT) + { + + __syncthreads(); + + //stripped layout requires another trip through shared.. 
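+            // ("stripped" here means CUB's striped layout: thread t owns elements
+            //  t, t+THREADS, ...; the scan above produced a blocked arrangement, so the
+            //  results are transposed through shared memory and re-read in striped order.)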
+ #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i] = work_element_out[i]; + + __syncthreads(); + + // run from back to front so we can just decrese the count iif elements cross thread boundaries (same as below, just with different indices) + #pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT-1; i >= 0; --i) + { + if (i*THREADS + threadIdx.x < num_distribute) + { + work_element_out[i] = work_offsets[threadIdx.x + i*THREADS]; + int workoffset = (threadIdx.x + i*THREADS); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + + //if ((within_element_id[i] < 0 && i + 1 < outElements) || (workoffset >= num_distribute)) + // outElements = i + 1; + } + else + { + outElements = i; + work_element_out[i] = -1; + within_element_id[i] = -1; + } + } + } + else + { + // run from back to front so we can just decrese the count iif elements cross thread boundaries + #pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + int workoffset = (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x+i); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + if (workoffset >= num_distribute) + outElements = i; + } + } + + __syncthreads(); + + // update counts + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x + i*THREADS + 1] = max(0,work_sum[threadIdx.x + i*THREADS + 1] - num_distribute); + // printf("nwork: %d %d : %d\n", blockIdx.x, threadIdx.x + i*THREADS + 1, work_sum[threadIdx.x + i*THREADS + 1]); + } + + __syncthreads(); + + return outElements; + } + + template + __device__ __forceinline__ + static int assignWorkAllThreads_depricated(SharedMemT& smem, SharedTempMemT& sum_space, SharedTempMemOutT& tempmem, + int(&work_element_out)[MAX_ELEMENTS_PER_THREAD_OUT], int(&within_element_id)[MAX_ELEMENTS_PER_THREAD_OUT], + uint32_t* max_A_entry, uint32_t* max_B_for_max_A_entry, int num_distribute = MAX_ELEMENTS_PER_THREAD_OUT*THREADS) + { + int* work_sum = smem.work_sum; + int* work_offsets = tempmem.work_offsets; + + // clear work offsets +#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[i*THREADS + threadIdx.x] = 0; + + __syncthreads(); + + // compute which thread should start with a given work element +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + int v = work_sum[i*THREADS + threadIdx.x]; + int vn = work_sum[i*THREADS + threadIdx.x + 1]; + if (v < MAX_ELEMENTS_PER_THREAD_OUT*THREADS && v != vn) + work_offsets[v] = i*THREADS + threadIdx.x; + } + + __syncthreads(); + + //compute max per thread elements + num_distribute = min(num_distribute, work_sum[THREADS*ELEMENTS_PER_THREAD_IN]); + + // read my offset (can be the right offset or zero as only the first one will have the right per input element) +#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + { + //if (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i < num_distribute) + work_element_out[i] = work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i]; + //else + //work_element_out[i] = 0; + } + + + SimpleScanT(sum_space).InclusiveScan(work_element_out, work_element_out, cub::Max()); + + int outElements = MAX_ELEMENTS_PER_THREAD_OUT; + if (!BLOCKOUT) + { + + __syncthreads(); + + //stripped layout requires another trip through shared.. 
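+            // (Same blocked-to-striped transpose through shared memory as in
+            //  assignWorkAllThreads above; this variant additionally records the last
+            //  assigned work element in max_A_entry / max_B_for_max_A_entry.)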
+#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i] = work_element_out[i]; + + __syncthreads(); + + // run from back to front so we can just decrese the count iif elements cross thread boundaries (same as below, just with different indices) +#pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + if (i*THREADS + threadIdx.x < num_distribute) + { + work_element_out[i] = work_offsets[threadIdx.x + i*THREADS]; + int workoffset = (threadIdx.x + i*THREADS); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + + //TODO: needs adjustment for num_distribute + if (max_A_entry && (threadIdx.x == THREADS - 1) && (i == (MAX_ELEMENTS_PER_THREAD_OUT - 1))) + { + // Set max element in A and corresponding max element in B + *max_A_entry = work_element_out[i]; + *max_B_for_max_A_entry = within_element_id[i]; + } + //if ((within_element_id[i] < 0 && i + 1 < outElements) || (workoffset >= num_distribute)) + // outElements = i + 1; + } + else + { + outElements = i; + work_element_out[i] = -1; + within_element_id[i] = -1; + } + } + } + else + { + // run from back to front so we can just decrese the count iif elements cross thread boundaries +#pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + int workoffset = (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + if (workoffset >= num_distribute) + outElements = i; + } + } + + __syncthreads(); + + // update counts +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x + i*THREADS + 1] = max(0, work_sum[threadIdx.x + i*THREADS + 1] - num_distribute); + // printf("nwork: %d %d : %d\n", blockIdx.x, threadIdx.x + i*THREADS + 1, work_sum[threadIdx.x + i*THREADS + 1]); + } + + __syncthreads(); + + return outElements; + } + + __device__ __forceinline__ + static int workAvailable(SharedMemT& smem) + { + //if (threadIdx.x == 0) + // printf("%d work available: %d\n", blockIdx.x, smem.work_sum[ELEMENTS_PER_THREAD_IN*THREADS]); + return const_cast(smem.work_sum)[ELEMENTS_PER_THREAD_IN*THREADS]; + } + __device__ __forceinline__ + static void removework(SharedMemT& smem, int amount) + { + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + smem.work_sum[threadIdx.x + i*THREADS + 1] = max(0, smem.work_sum[threadIdx.x + i*THREADS + 1] - amount); + } + } +}; \ No newline at end of file diff --git a/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh b/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh new file mode 100644 index 00000000..7a9d4e8c --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh @@ -0,0 +1,127 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * ChunkstoCSR.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" +#include "Chunk.cuh" + +template< typename VALUE_TYPE, typename INDEX_TYPE, typename OFFSET_TYPE> +__global__ void copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, + VALUE_TYPE * value_out, INDEX_TYPE * index_out, const OFFSET_TYPE* __restrict result_offets) +{ + using Chunk = ::Chunk; + + struct Smem + { + uint32_t chunksize; + uint32_t writeoffset; + const VALUE_TYPE* in_values; + const INDEX_TYPE* in_indices; + }; + + __shared__ Smem smem; + + uint32_t counter = blockIdx.x; + + while (counter < *chunk_pointer_alloc) + { + if(threadIdx.x == 0) + { + const Chunk* chunk = reinterpret_cast(chunks_pointers[counter]); + uint32_t chunksize = chunk->num_entries; + const VALUE_TYPE* in_values = chunk->values_direct(chunksize); + const INDEX_TYPE* in_indices = chunk->indices_direct(chunksize); + uint32_t firstrow = chunk->firstrow; + + uint32_t startingOffset = chunk->startingoffset(); + if(startingOffset == 0) + { + if (chunk->firstConsumed()) + { + uint32_t firstoffset = chunk->firstCountCleared(); + chunksize -= firstoffset; + in_values += firstoffset; + in_indices += firstoffset; + ++firstrow; + } + if (chunk->lastConsumed() && !chunk->isDirect()) + chunksize -= chunk->lastCountCleared(); + } + + smem.chunksize = chunksize; + smem.in_values = in_values; + smem.in_indices = in_indices; + + //special case for multiple chunk rows (need offset for writing!) 
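+                // (result_offets[firstrow] is the CSR row pointer of the chunk's first row;
+                //  startingoffset() is this chunk's offset within that row when the row is
+                //  split across several chunks, so the sum is the absolute write position.)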
+ smem.writeoffset = startingOffset + result_offets[firstrow]; + } + __syncthreads(); + + //write out + for (uint32_t i = threadIdx.x; i < smem.chunksize; i += blockDim.x) + { + value_out[smem.writeoffset + i] = smem.in_values[i]; + index_out[smem.writeoffset + i] = smem.in_indices[i]; + } + + counter += gridDim.x; + } + +} + +template +void AcSpGEMMKernels::h_copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, VALUE_TYPE * value_out, INDEX_TYPE * index_out, const uint32_t* __restrict result_offets) +{ + int blockSize(256); + + static size_t copyBlocksOnGPU = 0; + if (copyBlocksOnGPU == 0) + { + CUdevice dev; + cudaGetDevice(&dev); + int occ, sm; + void(*ptr)(void* const* __restrict, const uint32_t* __restrict, VALUE_TYPE *, INDEX_TYPE * index_out, const uint32_t* __restrict) = copyChunks< VALUE_TYPE, INDEX_TYPE, OFFSET_TYPE>; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occ, ptr, blockSize, 0); + cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev); + copyBlocksOnGPU = sm*occ; + } + copyChunks <<>>(chunks_pointers, chunk_pointer_alloc, value_out, index_out, result_offets); +} diff --git a/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh b/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh new file mode 100644 index 00000000..5a185112 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh @@ -0,0 +1,113 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
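h_copyChunks above sizes its grid once from the device's multiprocessor count and the kernel's occupancy, and copyChunks then iterates over chunks with a block-stride loop (counter += gridDim.x). A standalone sketch of that launch-sizing pattern follows; myKernel, fullOccupancyGrid and blockSize are placeholders, not names from the patch.

```cpp
// Sketch (illustrative): occupancy-based grid sizing for a persistent, grid-stride kernel.
#include <cuda_runtime.h>

__global__ void myKernel(const int* in, int* out, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        out[i] = in[i];   // placeholder body
}

int fullOccupancyGrid(int blockSize)
{
    int device = 0, numSMs = 0, blocksPerSM = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
    // Maximum number of resident blocks of myKernel per SM at this block size
    // (0 bytes of dynamic shared memory).
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, myKernel, blockSize, 0);
    return numSMs * blocksPerSM;   // just enough blocks to fill the device once
}

// Usage: myKernel<<<fullOccupancyGrid(256), 256>>>(d_in, d_out, n);
```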
+// + +/*!/------------------------------------------------------------------------------ + * DetermineBlockStarts.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" +#include "../common.h" + + +template +__global__ void DetermineBlockStarts(int num_other, const OFFSET_TYPE*__restrict offsets, uint32_t* startingIds, + uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id > num_other) + return; + + int a = offsets[id]; + int b = offsets[min(id + 1, num_other)]; + + int blocka = divup(a, NNZ_PER_BLOCK); + int blockb = (b - 1) / static_cast(NNZ_PER_BLOCK); + + //iterate over all blocks that start with that row + for (; blocka <= blockb; ++blocka) + startingIds[blocka] = id; + + //write last + if (id == num_other) + startingIds[divup(b, NNZ_PER_BLOCK)] = id - 1; + else + { + toClear[id] = 0, + toClear1[id] = 0; + } + toClear2[id] = 0; + + for (int i = id; i < num3; i+=num_other) + { + toClear3[i] = 0; + } + + for (int i = id; i < num4; i += num_other) + { + toClear4[i] = 0; + } + + for (int i = id; i < num5; i += num_other) + { + toClear5[i] = 0; + toClear6[i] = 0; + //toClear7[i] = 0; + } + + for (int i = id; i < num8; i += num_other) + { + toClear8[i] = 0; + } +} + +template +void AcSpGEMMKernels::h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8) +{ + // This method has a tendency to access memory illegally + DetermineBlockStarts <<>>(num_other, offsets, startingIds, toClear, toClear1, toClear2, num3, toClear3, + num4, toClear4, + num5, toClear5, toClear6, toClear7, + num8, toClear8); +} + + +#define GPUCompressedMatrixMatrixMultiplyBlockStarts(THREADS, NNZPERTHREAD) \ + template void AcSpGEMMKernels::h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8); + diff --git a/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh b/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh new file mode 100644 index 00000000..dde2f78c --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh @@ -0,0 +1,738 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software 
without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeGeneralized.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" + +#define ELEMENT_TO_SEARCH 10198 + +//binary search for an element in an array; returns the number of elements that are smaller or equal than the one +// we are looking for +template +__device__ __forceinline__ SIZE_TYPE binarySearch(const DATA_TYPE* start, const SIZE_TYPE count, const DATA_TYPE target) +{ + if (count == 0) + return 0; + + SIZE_TYPE lower_bound = 0; + SIZE_TYPE upper_bound = count - 1; + SIZE_TYPE tmp_loc; + + if (target < start[lower_bound]) + return 0; + + if (target > start[count - 1]) + return count; + + while (lower_bound <= upper_bound) + { + tmp_loc = (lower_bound + upper_bound) >> 1; + + if (target < start[tmp_loc]) + { + upper_bound = tmp_loc - 1; + } + else if (target > start[tmp_loc]) + { + lower_bound = tmp_loc + 1; + } + else + { + //we can have multiple target entries - let's skip them until we point after the last target + while (tmp_loc < count && start[tmp_loc] == target) + ++tmp_loc; + + return tmp_loc; + } + } + + return lower_bound; //element not found; return id of first element larger than target +} + +// samples the interval [lower, upper] s.t. 
each of the num_samples sub intervals is approximately the same size +template +__device__ __forceinline__ INDEX_TYPE getSample(INDEX_TYPE lower, INDEX_TYPE upper, uint32_t num_samples, uint32_t sample_point) +{ + float alpha = static_cast(sample_point + 1) / num_samples; + return (1 - alpha) * lower + alpha * upper; +} + +template +__device__ __forceinline__ uint32_t samplePosition(uint32_t minID, uint32_t maxID, int position = threadIdx.x + 1) +{ + return (divup((maxID - minID), THREADS)) * (position); +} + +template +__device__ __forceinline__ uint32_t sampling(typename cub::BlockScan::TempStorage& atomicMaxScanTemp, + INDEX_TYPE minID, INDEX_TYPE maxID, int numberChunks, uint32_t* max_sampling_category, + uint32_t (&sample_offsets)[THREADS], const INDEX_TYPE *__restrict__(&chunkIndices)[MERGE_MAX_CHUNKS], uint32_t (&chunkElementCount)[MERGE_MAX_CHUNKS]) +{ + uint32_t sampling_step = divup((maxID - minID), THREADS); + uint32_t my_sample_offset = 0; + for (auto round = 0; round < numberChunks; ++round) + { + // Reset intermediary offset + if (threadIdx.x == 0) + *max_sampling_category = 0; + sample_offsets[threadIdx.x] = 0; + __syncthreads(); + uint32_t count = chunkElementCount[round]; + for (int i = threadIdx.x; i < count - 1; i += THREADS) + { + // Fetch column Ids + INDEX_TYPE columnIndex = chunkIndices[round][i]; + INDEX_TYPE nextColumnIndex = chunkIndices[round][i + 1]; + INDEX_TYPE sampling_category = (columnIndex > 0) ? (columnIndex - 1) / sampling_step : 0; + INDEX_TYPE next_sampling_category = (nextColumnIndex - 1) / sampling_step; + if (sampling_category != next_sampling_category) + { + if (sampling_category < THREADS) + sample_offsets[sampling_category] = i + 1; + atomicMax(max_sampling_category, sampling_category); + } + } + __syncthreads(); + + // Set max + if (*max_sampling_category < (THREADS - 1)) + sample_offsets[*max_sampling_category + 1] = count; + __syncthreads(); + + uint32_t sample_value[1] = { sample_offsets[threadIdx.x] }; + // Propagate Max + cub::BlockScan(atomicMaxScanTemp).InclusiveScan(sample_value, sample_value, cub::Max()); + __syncthreads(); + // Write to global sample offsets + my_sample_offset += sample_value[0]; + __syncthreads(); + } + return my_sample_offset; +} + + +const int GlobalPathOffset = 0; +const int MinColumnOffset = 1; +const int MaxColumnOffset = 2; +const int ElementsHandledOffset = 3; + +// ######################################################################################### +// +// Generalized Case +// +// ######################################################################################### +template +__global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsGeneralized(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + static_assert(2 * INPUT_ELEMENTS_PER_THREAD * THREADS >= MERGE_MAX_CHUNKS, "Too many elements per column possible now!"); + + using Chunk = ::Chunk; + + using DirectChunk = ::DirectChunk; + + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SortAndCombiner = 
SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + typedef cub::BlockScan SimpleScanT; + const uint32_t LengthSamplesPerThread = (MERGE_MAX_CHUNKS + THREADS - 1) / THREADS; + using SingleLoadWorkDistribution = WorkDistribution; + using IndexSorter = cub::BlockRadixSort; + + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + using OUT_t = typename SEMIRING_t::output_t; + + struct SMem + { + uint32_t runflag, restart/*, max_sampling_category*/; + uint32_t numSharedRow; + int numChunks; + INDEX_TYPE maxColumnIdRow, currentMinColumnIdRow, currentMaxColumnIdRow; + int sumOut; + uint32_t completed; + uint32_t longChunkOffset; + INDEX_TYPE globalPath; + INDEX_TYPE elementsHandled; + + + + const INDEX_TYPE* __restrict chunkIndices[MERGE_MAX_CHUNKS]; + Either chunkValues[MERGE_MAX_CHUNKS]; + LEFT_T multiplier[MERGE_MAX_CHUNKS]; + uint32_t chunkElementCount[MERGE_MAX_CHUNKS]; + INDEX_TYPE sample_offsets[THREADS]; + INDEX_TYPE elementsInChunkConsumed[MERGE_MAX_CHUNKS]; + uint32_t current_path_elements[MERGE_MAX_CHUNKS]; + + // Used for sorting + uint32_t indexing[MERGE_MAX_CHUNKS]; + + union { + struct + { + ChunkSortType sort_keys[MERGE_MAX_CHUNKS]; + typename IndexSorter::TempStorage indexptrtempmem; + }; + + struct { + typename SingleLoadWorkDistribution::SharedMemT single_workdistributionMem; + typename SingleLoadWorkDistribution::SharedTempMemT single_workdistributionTempMem; + typename SingleLoadWorkDistribution:: template SharedTempMemOutT single_workdistributionTempMemOutFull; + }; + + typename SortAndCombiner::SMem single_sAndCMem; + + struct { + typename SEMIRING_t::output_t longOutDataBuffer[THREADS]; + INDEX_TYPE longOutIndexBuffer[THREADS]; + }; + }; + + }; + + __shared__ SMem smem; + + //determine the block's offset + if (threadIdx.x == 0) + { + uint32_t shared_handled = shared_rows_handled[(blockIdx.x + restart_offset)]; + smem.numSharedRow = 1 - shared_handled; + smem.runflag = *run_flag; + smem.restart = restart_completion[(blockIdx.x + restart_offset)]; + smem.sumOut = (smem.restart > RESTART_FIRST_ITERATION) ? 
output_row_count[sharedRows[blockIdx.x]] : 0; + } + __syncthreads(); + + // Already handled + if (smem.numSharedRow == 0) + return; + + __syncthreads(); + + if (threadIdx.x == 0) + { + //Get the one chunk that has elements of the block's row + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[blockIdx.x]]); + // DEBUG + // if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in Generalized\n", sharedRows[blockIdx.x]); + // DEBUG + uint32_t chunk_counter = 0; + + smem.currentMinColumnIdRow = std::numeric_limits::max(); + smem.maxColumnIdRow = 0; + + //As long as we have some chunk that has elements of the block's row keep reading + while (chunk != 0) + { + INDEX_TYPE minColumnId, maxColumnId; + bool first_row = (chunk & 2) != 0; + //get a pointer to the current chunk + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + uint32_t count; + const INDEX_TYPE* pIndices; + Either pValues; + int32_t numentries = pChunk->num_entries; + LEFT_T multiplier; + + smem.sort_keys[chunk_counter] = pChunk->sort_key; + + if (first_row) + { + // only first_row chunks can be direct ones + if (pChunk->isDirect()) + { + DirectChunk* __restrict pDirectChunk = reinterpret_cast(pChunk); + count = numentries; + pIndices = pDirectChunk->indices_direct(numentries); + pValues = Either::First(pDirectChunk->values_direct(numentries)); + multiplier = pDirectChunk->getMultiplier(); + chunk = reinterpret_cast(pChunk->readNextFront()); + pDirectChunk->setFirstConsumed(); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + } + else + { + count = pChunk->firstCountCleared(); + pChunk->setFirstConsumed(); + pIndices = pChunk->indices_direct(numentries); + pValues = Either::Second(pChunk->values_direct(numentries)); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + chunk = reinterpret_cast(pChunk->readNextFront()); + } + } + else + { + count = pChunk->lastCountCleared(); + pChunk->setLastConsumed(); + uint32_t baseoffset = numentries - count; + pIndices = pChunk->indices_direct(numentries) + baseoffset; + pValues = Either::Second(pChunk->values_direct(numentries) + baseoffset); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + chunk = reinterpret_cast(pChunk->readNextBack()); + } + + //Update global min/max column id + smem.currentMinColumnIdRow = min(smem.currentMinColumnIdRow, minColumnId); + smem.maxColumnIdRow = max(smem.maxColumnIdRow, maxColumnId); + smem.currentMaxColumnIdRow = smem.maxColumnIdRow; + + // We do not have enough memory to store more chunk info + if (chunk_counter >= MERGE_MAX_CHUNKS) + { + printf("ERROR: number of chunks (%d) exceeds maximum (%d) in block: %u;\n", chunk_counter, MERGE_MAX_CHUNKS, blockIdx.x); + __trap(); + smem.runflag = 1; + break; + } + else + { + smem.chunkIndices[chunk_counter] = pIndices; + + smem.chunkValues[chunk_counter] = pValues; + smem.chunkElementCount[chunk_counter] = count; + smem.multiplier[chunk_counter] = multiplier; + } + + ++chunk_counter; + } + smem.numChunks = chunk_counter; + } + __syncthreads(); + + if (smem.runflag != 0) + return; + + // Sort chunks + { + ChunkSortType key[LengthSamplesPerThread]; + uint32_t value[LengthSamplesPerThread]; + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + value[i / THREADS] = i; +#ifdef ENABLE_SORTING + if(i < smem.numChunks) + key[i/THREADS] = smem.sort_keys[i]; + else + key[i / THREADS] = 0xFFFFFFFF; +#endif + } +#ifdef ENABLE_SORTING + IndexSorter(smem.indexptrtempmem).Sort(key, value); +#endif + for 
(int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.indexing[(threadIdx.x*LengthSamplesPerThread) + (i / THREADS)] = value[i / THREADS]; + //smem.indexing[i] = i; + } + } + __syncthreads(); + + int chunkWorkElements[LengthSamplesPerThread]; + //Perform the sampling + if (smem.restart < RESTART_FIRST_ITERATION) + { + //determine for each thread which column id he has to look for in the chunks + uint32_t sample = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, threadIdx.x); + + //warp based sampling in rounds; in round r thread i works on chunk (i+r) % n + INDEX_TYPE my_sample_offset = 0; + int wid = threadIdx.x / 32; + for (auto round = 0; round < smem.numChunks; ++round) + { + uint32_t count = smem.chunkElementCount[smem.indexing[(wid + round) % smem.numChunks]]; + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[(wid + round) % smem.numChunks]]; + //perform binary search for sample in [pIndices, pIndices + count) and accumulate sample_locations + my_sample_offset += binarySearch(pIndices, count, sample); + } + + //uint32_t my_sample_offset = sampling(smem.atomicMaxScanTemp, smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.numChunks, &(smem.max_sampling_category), smem.sample_offsets, smem.chunkIndices, smem.chunkElementCount); + + //write the threads sample offset to shared + smem.sample_offsets[threadIdx.x] = my_sample_offset; + restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x] = my_sample_offset; + if (threadIdx.x == 0) + { + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset] = smem.currentMinColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset] = smem.currentMaxColumnIdRow; + } + } + //We already restarted at least once and have done at least one iteration in the last run, hence, we have values that we want to reuse + else + { + smem.sample_offsets[threadIdx.x] = restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x]; + if (threadIdx.x == 0) + { + smem.currentMinColumnIdRow = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset]; + smem.currentMaxColumnIdRow = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset]; + } + } + + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.elementsInChunkConsumed[i] = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) + { + smem.globalPath = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset]; + smem.elementsHandled = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + ElementsHandledOffset]; + smem.restart = RESTART_FIRST_ITERATION; + } + + //we want to wait here s.t. e.g. 
smem.sample_offsets is available + __syncthreads(); + + bool sampling_required{ false }; + while (true) + { + // Maybe resampling is required + if (sampling_required) + { + if (threadIdx.x == 0) + { + uint32_t minColumnIdRow = smem.currentMinColumnIdRow; + + if (smem.globalPath > 0 && smem.globalPath != static_cast(-1)) + { + smem.currentMinColumnIdRow = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, smem.globalPath - 1); + //smem.currentMinColumnIdRow = samplePosition(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.globalPath - 1); + } + + if (minColumnIdRow == smem.currentMinColumnIdRow && smem.globalPath != static_cast(-1)) + { + smem.currentMaxColumnIdRow = (smem.currentMinColumnIdRow + smem.currentMaxColumnIdRow) >> 1; + } + + smem.globalPath = 0; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset] = smem.currentMinColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset] = smem.currentMaxColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset] = smem.globalPath; + } + __syncthreads(); + sampling_required = false; + + //determine for each thread which column id he has to look for in the chunks + uint32_t sample = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, threadIdx.x); + + //warp based sampling in rounds; in round r thread i works on chunk (i+r) % n + INDEX_TYPE my_sample_offset = 0; + int wid = threadIdx.x / 32; + for (auto round = 0; round < smem.numChunks; ++round) + { + uint32_t count = smem.chunkElementCount[smem.indexing[(wid + round) % smem.numChunks]]; + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[(wid + round) % smem.numChunks]]; + //perform binary search for sample in [pIndices, pIndices + count) and accumulate sample_locations + my_sample_offset += binarySearch(pIndices, count, sample); + } + //write the threads sample offset to shared + smem.sample_offsets[threadIdx.x] = my_sample_offset; + + //uint32_t my_sample_offset = sampling(smem.atomicMaxScanTemp, smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.numChunks, &(smem.max_sampling_category), smem.sample_offsets, smem.chunkIndices, smem.chunkElementCount); + restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x] = my_sample_offset; + } + __syncthreads(); + + //Decide where to perform the next cut; how many elements/columns do we want to handle now? 
+ // after this the variables are updated to hold the new path [start sample id, end sample id) + bool path_boundary = false; + bool last_path = false; + //check whether we can handle all remaining columns now; this would be the last path + if (smem.sample_offsets[THREADS - 1] - smem.elementsHandled <= ELEMENTS_PER_THREAD * THREADS) + { + if (threadIdx.x == THREADS - 1) + last_path = true; + } + else + { + path_boundary = threadIdx.x >= smem.globalPath && threadIdx.x < THREADS - 1 && + smem.sample_offsets[threadIdx.x] - smem.elementsHandled <= ELEMENTS_PER_THREAD * THREADS && + smem.sample_offsets[threadIdx.x + 1] - smem.elementsHandled > ELEMENTS_PER_THREAD * THREADS && + smem.sample_offsets[threadIdx.x] - smem.elementsHandled != 0; + } + + // If no path can be chosen as any are too large to be handled -> resample + sampling_required = __syncthreads_and(!path_boundary && !last_path); + if (sampling_required) + continue; + + //the thread with the id of the last column that should be handled updates the global path boundaries + if (path_boundary || last_path) + { + smem.globalPath = threadIdx.x + 1; //first sample id *not* in the current path + smem.completed = last_path; + } + __syncthreads(); + + //For each chunk: determine cutoff id using a binary search aka. determine local path + for(int i = 0; i < LengthSamplesPerThread; ++i) + chunkWorkElements[i] = 0; + for (int chunk = threadIdx.x; chunk < smem.numChunks; chunk += THREADS) + { + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[chunk]]; + uint32_t count = smem.chunkElementCount[smem.indexing[chunk]]; + //how much of this chunk did we already consume? This is at the same time the start of the next local path; + const uint32_t prev_cutoff = smem.elementsInChunkConsumed[chunk]; + + //determine how many elements of this chunk are part of the current path + uint32_t look_for = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, smem.globalPath - 1); + //uint32_t look_for = samplePosition(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.globalPath - 1); + + smem.current_path_elements[chunk] = (count > prev_cutoff) ? 
binarySearch(pIndices + prev_cutoff, count - prev_cutoff, look_for) : 0; + //update the number of consumed elements for each chunk + smem.elementsInChunkConsumed[chunk] += smem.current_path_elements[chunk]; + //how many elements to handle in this chunk in the current path + chunkWorkElements[chunk / THREADS] = smem.current_path_elements[chunk]; + } + __syncthreads(); + + SingleLoadWorkDistribution:: template initialize(smem.single_workdistributionMem, smem.single_workdistributionTempMem, chunkWorkElements); + + int chunk[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = SingleLoadWorkDistribution:: template assignWorkAllThreads( + smem.single_workdistributionMem, smem.single_workdistributionTempMem, smem.single_workdistributionTempMemOutFull, + chunk, element); + + //combine entries of the current path in shared and write them into global + int numOut; + // Combine entries + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + uint32_t combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const INDEX_TYPE* __restrict ip = smem.chunkIndices[smem.indexing[chunk[i]]] + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + combIndex[i] = ip[element[i]]; + + if ( smem.chunkValues[smem.indexing[chunk[i]]].isFirst()) { + const RIGHT_t* dp = smem.chunkValues[smem.indexing[chunk[i]]].valFirst() + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + data[i] = semiring.multiply(smem.multiplier[smem.indexing[chunk[i]]], dp[element[i]]); + + } else { + const OUT_t* dp = smem.chunkValues[smem.indexing[chunk[i]]].valSecond() + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + data[i] = dp[element[i]]; + } + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + __syncthreads(); + + numOut = SortAndCombiner::combine(smem.single_sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [](auto a, auto b) { + return true; + }, semiring); + // ######## DEBUG + //if (numOut == 0 && threadIdx.x == 0) + //{ + // printf("%d %d oops in generalized\n", blockIdx.x, threadIdx.x); + //} + // ######## DEBUG + } + + // create new chunk (could also reuse old ones if completely used up...?) 
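+ // (Explanatory note, not part of the original kernel.) The single-thread block
+ // below allocates a fresh output chunk for the numOut combined entries. If the
+ // chunk allocator or the chunk-pointer array runs out of space, the block stores
+ // its restart state (restart_completion here; sample offsets and per-chunk
+ // consumption are written elsewhere in this kernel) and raises a bit in
+ // *run_flag so the host can grow the corresponding buffer and relaunch:
+ //   0x1 -> chunk memory exhausted (allocChunk failed)
+ //   0x2 -> chunk-pointer array exhausted
+ // A hedged host-side sketch of how such a flag could be polled; the identifiers
+ // (d_run_flag, h_run_flag) are illustrative only, not the library's actual host API:
+ #if 0
+ uint32_t h_run_flag = 0;                                    // host copy of the flag
+ cudaMemcpy(&h_run_flag, d_run_flag, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ if (h_run_flag & 0x1) { /* enlarge chunk memory, keep restart state, relaunch merge */ }
+ if (h_run_flag & 0x2) { /* enlarge the chunk-pointer array, relaunch merge */ }
+ #endif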
+ if (threadIdx.x == 0) + { + uint32_t chunkoff; + int ignored; + if (!allocChunk(numOut, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + chunkoff = static_cast(-1); + atomicOr(run_flag, 0x1); + // Write restart state + restart_completion[(blockIdx.x + restart_offset)] = smem.restart; + } + else + { + //need to add flag and offset for copy later (offset = s) + uint32_t s = smem.sumOut; + INDEX_TYPE actualrow = sharedRows[blockIdx.x]; + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = static_cast(-1); + atomicOr(run_flag,0x2); + if (chunk_pointer_position == chunk_pointer_sizes) + { + *chunk_pointer_pos = chunk_pointer_sizes; + } + restart_completion[(blockIdx.x + restart_offset)] = smem.restart; + } + else + { + //FIXME SUSPICIOUS LINE april 25 + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::place(chunks, chunkoff, numOut, actualrow, Chunk::StartingOffsetFlag | s, 0)); + //write row count + s += numOut; + smem.sumOut = s; + output_row_count[actualrow] = s; + } + } + smem.longChunkOffset = chunkoff; + } + __syncthreads(); + + if (smem.longChunkOffset == static_cast(-1)) + { + return; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + + smem.longOutDataBuffer[pwrite] = combinedEntries[i].value; + smem.longOutIndexBuffer[pwrite] = combinedEntries[i].index; + } + } + __syncthreads(); + + //write outg + if (written + threadIdx.x < numOut) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.longChunkOffset)->values_direct(numOut); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.longChunkOffset)->indices_direct(numOut); + + valstart[written + threadIdx.x] = smem.longOutDataBuffer[threadIdx.x]; + indexstart[written + threadIdx.x] = smem.longOutIndexBuffer[threadIdx.x]; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + i] = smem.elementsInChunkConsumed[i]; + } + if (threadIdx.x == 0) + { + smem.elementsHandled = smem.sample_offsets[smem.globalPath - 1]; //update path start (first sample id in the path) + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset] = smem.globalPath; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + ElementsHandledOffset] = smem.elementsHandled; + } + __syncthreads(); + + // Work is done, we can stop now + if (smem.completed) + { + if(smem.currentMaxColumnIdRow == smem.maxColumnIdRow) + break; + + __syncthreads(); + + if (threadIdx.x == 0) + { + smem.globalPath = static_cast(-1); + smem.currentMinColumnIdRow = smem.currentMaxColumnIdRow + 1; + smem.currentMaxColumnIdRow = smem.maxColumnIdRow; + } + sampling_required = true; + } + + smem.restart = RESTART_ITERATION_UNKNOWN; + __syncthreads(); + } + + // This row is done + if (threadIdx.x == 0) + { + shared_rows_handled[(blockIdx.x + restart_offset)] = 1; + } +} + +template + void AcSpGEMMKernels::h_mergeSharedRowsGeneralized(const uint32_t* 
__restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + mergeSharedRowsGeneralized<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, + restart_sampleOffs, restart_chunkElementsConsumedAndPath, restart_offset, chunk_pointer_pos, semiring); +} + + +#define GPUCompressedMatrixMatrixMultiplyMergeGeneralized(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsGeneralized \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, \ + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos); + diff --git a/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh b/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh new file mode 100644 index 00000000..305ad039 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh @@ -0,0 +1,890 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeMaxChunks.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +// Local includes +#include "MultiplyKernels.h" +#include "../meta_utils.h" +#include // operator typeid + + +#include + +#define DIVISION_FACTOR 2 + + + + + +// ######################################################################################### +// Resampling +// + template +__device__ __forceinline__ void printSampling(const uint32_t* __restrict sharedRows, int numChunks, INDEX_TYPE (&id_samples)[MERGE_MAX_CHUNKS][MERGE_MAX_PATH_OPTIONS], + int row_index) +{ + if (sharedRows[blockIdx.x] == row_index && threadIdx.x == 0) + { + for (int i = 0; i < numChunks*MERGE_MAX_PATH_OPTIONS; ++i) + { + if (i % MERGE_MAX_PATH_OPTIONS == 0) + printf("\n"); + printf("%u ", id_samples[i / MERGE_MAX_PATH_OPTIONS][i % MERGE_MAX_PATH_OPTIONS]); + } + printf("\n"); + } +} + +__device__ __forceinline__ void printInvalidPath(const uint32_t* __restrict sharedRows) +{ + if (threadIdx.x == 0) + { + printf("%u\n", sharedRows[blockIdx.x]); + } +} + +__device__ __forceinline__ void printCountPerSampling(const uint32_t* __restrict sharedRows, uint32_t outputCount, uint32_t sampleID, uint32_t UpperBound, uint32_t row) +{ + if (outputCount < UpperBound && sharedRows[blockIdx.x] == row) + { + printf("Thread: %u -- Outputcount: %u -- SampleID: %u\n", threadIdx.x, outputCount, sampleID); + } +} + + + + + +// ######################################################################################### +// +// Max Chunks Case +// +// ######################################################################################### +template + __global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, + SEMIRING_t semiring) +{ + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + + using OUT_t = typename SEMIRING_t::output_t; + + using Chunk = ::Chunk; + + using DirectChunk = ::DirectChunk; + + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SingleLoadWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + const uint32_t PathEncodingBits = 32 - count_clz::value; + using PathEncoding = ChooseBitDataType::value>; + + + constexpr const uint32_t LengthSamplesPerThread = ((MERGE_MAX_PATH_OPTIONS + 1)*MERGE_MAX_CHUNKS + THREADS - 1) / 
THREADS; + + constexpr bool problem = LengthSamplesPerThread >= 1; + static_assert(problem, "LengthSamplesPerThread must be >= 1"); + + using SampleSorter = cub::BlockRadixSort; + using PathMergeScan = cub::BlockScan; + using IndexSorter = cub::BlockRadixSort; + + struct SMem + { + + + uint32_t runflag, restart, halveStep; + uint32_t startSharedRow, numSharedRow; + int numChunks; + int sumOut; + uint32_t completed; + PathEncoding usePath; + union { + INDEX_TYPE useMaxId; + uint32_t remCounter; + }; + uint32_t longChunkOffset; + const INDEX_TYPE* __restrict chunkIndices[MERGE_MAX_CHUNKS]; + Either chunkValues[MERGE_MAX_CHUNKS]; //RL FIXME : add restrict back to internal pointer types? + T multiplier[MERGE_MAX_CHUNKS]; + uint32_t chunkElementCount[MERGE_MAX_CHUNKS]; + volatile uint32_t chunkTakeElements[MERGE_MAX_CHUNKS]; + + // Used for sorting + uint32_t indexing[MERGE_MAX_CHUNKS]; + + union { + struct + { + ChunkSortType sort_keys[MERGE_MAX_CHUNKS]; + typename IndexSorter::TempStorage indexptrtempmem; + }; + struct + { + union { + INDEX_TYPE id_samples[MERGE_MAX_CHUNKS][MERGE_MAX_PATH_OPTIONS]; + struct { + typename SampleSorter::TempStorage sorterTempMem; + typename PathMergeScan::TempStorage pathmergeTempMem; + }; + struct { + uint32_t downStreamCount[THREADS + 1]; + INDEX_TYPE downStreamIndices[THREADS + 1]; + }; + }; + }; + struct { + typename SingleLoadWorkDistribution::SharedMemT single_workdistributionMem; + typename SingleLoadWorkDistribution::SharedTempMemT single_workdistributionTempMem; + typename SingleLoadWorkDistribution:: template SharedTempMemOutT single_workdistributionTempMemOutFull; + }; + typename SortAndCombiner::SMem single_sAndCMem; + struct { + OUT_t longOutDataBuffer[THREADS]; + INDEX_TYPE longOutIndexBuffer[THREADS]; + }; + }; + }; + + __shared__ SMem smem; + + //get my block's offset + if (threadIdx.x == 0) + { + uint32_t shared_handled = shared_rows_handled[blockIdx.x + restart_offset]; + smem.numSharedRow = 1 - shared_handled; + smem.runflag = *run_flag; + smem.restart = restart_completion[blockIdx.x + restart_offset]; + smem.sumOut = (smem.restart > RESTART_FIRST_ITERATION) ? 
output_row_count[sharedRows[blockIdx.x]] : 0; + smem.halveStep = 0; + } + __syncthreads(); + + if (smem.numSharedRow == 0) + return; + + + + // Read in chunks (maximum MERGE_MAX_CHUNKS) + if (threadIdx.x == 0 && smem.restart < RESTART_FIRST_ITERATION) + { + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[blockIdx.x]]); + // if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in MAX CHUNKS\n", sharedRows[blockIdx.x]); + uint32_t chunk_counter = 0; + uint32_t outsum = 0; + + while (chunk != 0) + { + bool first_row = (chunk & 2) != 0; + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + uint32_t count; + const INDEX_TYPE* pIndices; + Either pValues; + int32_t numentries = pChunk->num_entries; + typename SEMIRING_t::leftInput_t multiplier; + + smem.sort_keys[chunk_counter] = pChunk->sort_key; + + if (first_row) + { + //only first rows can be direct + if (pChunk->isDirect()) + { + DirectChunk* __restrict pDirectChunk = reinterpret_cast(pChunk); + count = numentries; + pIndices = pDirectChunk->indices_direct(numentries); + pValues = Either::First(pDirectChunk->values_direct(numentries)); + multiplier = pDirectChunk->getMultiplier(); + pDirectChunk->setFirstConsumed(); + chunk = reinterpret_cast(pChunk->readNextFront()); + } + else + { + count = pChunk->firstCountCleared(); + pChunk->setFirstConsumed(); + pIndices = pChunk->indices_direct(numentries); + pValues =Either::Second( pChunk->values_direct(numentries)); + chunk = reinterpret_cast(pChunk->readNextFront()); + } + } + else + { + count = pChunk->lastCountCleared(); + pChunk->setLastConsumed(); + uint32_t baseoffset = numentries - count; + pIndices = pChunk->indices_direct(numentries) + baseoffset; + pValues = Either::Second(pChunk->values_direct(numentries) + baseoffset); + chunk = reinterpret_cast(pChunk->readNextBack()); + } + + if (chunk_counter >= MERGE_MAX_CHUNKS) + { + printf("%d %d too many chunks: %d %d : count is : %u and should not be more than: %u\n", blockIdx.x, threadIdx.x, chunk_counter + 1, outsum + count, output_row_count[sharedRows[blockIdx.x]], ELEMENTS_PER_THREAD *THREADS * (MERGE_MAX_CHUNKS - 1)); + smem.runflag = 1; + } + else + { + smem.chunkIndices[chunk_counter] = pIndices; + smem.chunkValues[chunk_counter] = pValues; + smem.chunkElementCount[chunk_counter] = count; + smem.multiplier[chunk_counter] = multiplier; + } + // DEBUG + //if(sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Chunk %d : Count: %d Row: %u\n", chunk_counter, count, sharedRows[blockIdx.x]); + // DEBUG + outsum += count; + ++chunk_counter; + } + + smem.numChunks = min(chunk_counter, MERGE_MAX_CHUNKS); + smem.completed = (outsum < ELEMENTS_PER_THREAD*THREADS) ? 
1 : 0; + if (smem.restart == RESTART_OFF) + restart_num_chunks[(blockIdx.x)] = smem.numChunks; + } + else if (threadIdx.x == 0) + { + smem.numChunks = restart_num_chunks[(blockIdx.x)]; + smem.completed = 0; + } + __syncthreads(); + + if (smem.runflag != 0) + return; + + // Sorting only if >= RESTART_FIRST_ITERATION + { + uint32_t value[1]{threadIdx.x}; + if (smem.restart < RESTART_FIRST_ITERATION) + { + ChunkSortType key[1]; + + if (threadIdx.x < smem.numChunks) + key[0] = smem.sort_keys[threadIdx.x]; + else + key[0] = 0xFFFFFFFF; +#ifdef ENABLE_SORTING + IndexSorter(smem.indexptrtempmem).Sort(key, value); +#endif + } + + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.indexing[threadIdx.x] = value[0]; + } + } + __syncthreads(); + + // If elements can't be held in temp, load samples (MERGE_MAX_PATH_OPTIONS per chunk) + if (!smem.completed) + { + if (smem.restart >= RESTART_FIRST_ITERATION) + { + // Load values from last restart + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + if (lid == 0) + { + // Do not use indexing here as we write the chunks out in correct order + smem.chunkElementCount[wip] = restart_chunkElementCount[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + smem.chunkIndices[wip] = restart_chunkIndices[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + //fixme: RL bad practice....... + smem.chunkValues[wip] = *reinterpret_cast*> (&restart_chunkValues[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]); + smem.multiplier[wip] = restart_multiplier[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + } + } + if (threadIdx.x == 0 && smem.restart == RESTART_ITERATION_FINISH) + { + // We want to finish in the next iteration + smem.completed = 1; + } + } + else + { + __syncthreads(); + // We start our first iteration soon + if (threadIdx.x == 0) + { + smem.restart = RESTART_FIRST_ITERATION; + } + } + __syncthreads(); + + //load samples from each list for column offset (warp based in parallel) + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + uint32_t count = smem.chunkElementCount[smem.indexing[wip]]; + uint32_t step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + uint32_t test = min(count - 1, step * lid); + INDEX_TYPE id = count > 0 ? smem.chunkIndices[smem.indexing[wip]][test] : 0xFFFFFFFF; + smem.id_samples[wip][lid] = id; + } + } + else if (threadIdx.x == 0) + { + // We are in the wrong case, remember that here + smem.restart = RESTART_WRONG_CASE; + } + __syncthreads(); + + // DEBUG + //printSampling(sharedRows, smem.numChunks, smem.id_samples, ROW_TO_INVESTIGATE); + // DEBUG + + while (true) + { + int chunkWorkElements[1]; + if (!smem.completed) + { + INDEX_TYPE mySampledIds[LengthSamplesPerThread]; + ushort2 mySamplePayload[LengthSamplesPerThread]; + +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + { + uint32_t lid = i*THREADS + threadIdx.x; + uint32_t chunk = lid / (MERGE_MAX_PATH_OPTIONS + 1); + uint32_t sample = lid - chunk * (MERGE_MAX_PATH_OPTIONS + 1); + if (chunk < smem.numChunks) + { + mySampledIds[i] = sample == 0 ? 
0 : smem.id_samples[chunk][sample - 1]; + mySamplePayload[i] = make_ushort2(chunk, sample); + } + else + { + mySampledIds[i] = 0xFFFFFFFF; + mySamplePayload[i] = make_ushort2(MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS + 1); + } + } + __syncthreads(); + + //sort according to index + SampleSorter(smem.sorterTempMem).Sort(mySampledIds, mySamplePayload); + + //construct bitmask + PathEncoding paths[LengthSamplesPerThread]; +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + paths[i] = static_cast(mySamplePayload[i].y) << static_cast(mySamplePayload[i].x * PathEncodingBits); + //merge up + PathMergeScan(smem.pathmergeTempMem).InclusiveScan(paths, paths, PathMergerOp()); + + // reset and then compute output count + uint32_t outputCount[LengthSamplesPerThread]; + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + outputCount[i] = 0; + + const PathEncoding Mask = (1 << PathEncodingBits) - 1; +#pragma unroll + for (uint32_t chunk = 0; chunk < MERGE_MAX_CHUNKS; ++chunk) + { + if (chunk < smem.numChunks) + { + uint32_t count = smem.chunkElementCount[smem.indexing[chunk]]; + uint32_t step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + { + uint32_t chunkPath = static_cast((paths[i] >> (PathEncodingBits * chunk)) & Mask); + outputCount[i] += min(count, step * chunkPath); + } + } + } + __syncthreads(); + + // ######## DEBUG + //printCountPerSampling(sharedRows, outputCount[0], mySampledIds[0], 2 * ELEMENTS_PER_THREAD*THREADS, ROW_TO_INVESTIGATE); + // ######## DEBUG + + //publish so next can check it + smem.downStreamCount[THREADS] = 0xFFFFFFFF; + smem.downStreamIndices[THREADS] = 0; + smem.downStreamIndices[threadIdx.x] = mySampledIds[0]; + + smem.usePath = 0; + smem.useMaxId = 0; + __syncthreads(); + + // Propagate outputcount locally first such that first element per array is correct +#pragma unroll + for (uint32_t i = LengthSamplesPerThread - 1; i > 0; --i) + if (mySampledIds[i - 1] == mySampledIds[i]) + outputCount[i - 1] = outputCount[i]; + + smem.downStreamCount[threadIdx.x] = outputCount[0]; + __syncthreads(); + + //propagate count over equal ids over arrays + bool prop = mySampledIds[0] == smem.downStreamIndices[threadIdx.x + 1] && + mySampledIds[0] != 0xFFFFFFFF; + bool changed; + do + { + changed = prop && smem.downStreamCount[threadIdx.x + 1] != outputCount[0]; + if (changed) + smem.downStreamCount[threadIdx.x] = outputCount[0] = smem.downStreamCount[threadIdx.x + 1]; + changed = __syncthreads_or(changed); + } while (changed); + + //propagate count locally again + if (mySampledIds[LengthSamplesPerThread - 1] == smem.downStreamIndices[threadIdx.x + 1]) + outputCount[LengthSamplesPerThread - 1] = smem.downStreamCount[threadIdx.x + 1]; +#pragma unroll + for (uint32_t i = LengthSamplesPerThread - 1; i > 0; --i) + if (mySampledIds[i - 1] == mySampledIds[i]) + outputCount[i - 1] = outputCount[i]; + + // ######## DEBUG + //printCountPerSampling(sharedRows, outputCount[0], mySampledIds[0], 2 * ELEMENTS_PER_THREAD*THREADS, ROW_TO_INVESTIGATE); + // ######## DEBUG + + //find the first that goes over the threshold + if (outputCount[LengthSamplesPerThread - 1] <= ELEMENTS_PER_THREAD*THREADS && smem.downStreamCount[threadIdx.x + 1] > ELEMENTS_PER_THREAD*THREADS) + { + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("THREAD: %u Outputcount: %u Next Count %u | path: %llu maxid: %u\n", threadIdx.x, outputCount[LengthSamplesPerThread - 1], 
smem.downStreamCount[threadIdx.x + 1], paths[LengthSamplesPerThread - 1], smem.downStreamIndices[threadIdx.x + 1]);*/ + // ######## DEBUG + smem.usePath = paths[LengthSamplesPerThread - 1]; + smem.useMaxId = smem.downStreamIndices[threadIdx.x + 1]; + } + +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread - 1; ++i) + { + if (outputCount[i] <= ELEMENTS_PER_THREAD*THREADS && outputCount[i + 1] > ELEMENTS_PER_THREAD*THREADS) + { + smem.usePath = paths[i]; + smem.useMaxId = mySampledIds[i + 1]; + } + } + + smem.completed = 1; + __syncthreads(); + + if (smem.usePath == 0) + { + //if (sharedRows[blockIdx.x] != ROW_TO_INVESTIGATE) + // return; + + // ######## DEBUG + /*if(sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printInvalidPath(sharedRows);*/ + // ######## DEBUG + + if (threadIdx.x == 0) + { + smem.useMaxId = UINT32_MAX; + smem.halveStep = 1; + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("-----------------------------------------------------------------------------------\n");*/ + // ######## DEBUG + } + __syncthreads(); + + // Go one half step -> get smallest ID + // -> all chunks should reach this with now at most half the workload + if (threadIdx.x < smem.numChunks) + { + uint32_t count = smem.chunkElementCount[smem.indexing[threadIdx.x]]; + int step = ((count + (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR) - 1) / (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR)); + if (count > 1) + { + INDEX_TYPE id = smem.chunkIndices[smem.indexing[threadIdx.x]][step]; + // ######## DEBUG + //if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Chunk: %d with Count: %d - step: %d| Check out ID for chunk: %u\n", threadIdx.x, count, step, id); + // ######## DEBUG + atomicMin(&(smem.useMaxId), id); + } + } + __syncthreads(); + + // Select all chunks that are below this ID + if (threadIdx.x == 0) + { + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("MaxID chosen: %u\n", smem.useMaxId);*/ + // ######## DEBUG + for (int i = 0; i < smem.numChunks; ++i) + { + if (smem.chunkElementCount[smem.indexing[i]] > 0 && smem.chunkIndices[smem.indexing[i]][0] < smem.useMaxId) + { + // Take these chunks -> for each chunk set the path to 1 + smem.usePath |= static_cast(1) << static_cast(i * PathEncodingBits); + } + } + } + __syncthreads(); + } + // ###################################################################################################################################################### + + + //determine actual chunk ends to use + for (int wip = threadIdx.x / WARP_SIZE; wip < smem.numChunks; wip += THREADS / WARP_SIZE) + { + const PathEncoding PathCodingMask = (1 << PathEncodingBits) - 1; + int lpos = static_cast((smem.usePath >> (wip*PathEncodingBits)) & PathCodingMask); + int count = smem.chunkElementCount[smem.indexing[wip]]; + int step; + if (smem.halveStep) + step = ((count + (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR) - 1) / (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR)); + else + step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + int startpos = max(0, step * (lpos - 1)); + int endpos = min(count, step * lpos); + + smem.chunkTakeElements[wip] = endpos; + int current = endpos; + + for (int i = startpos + laneid(); i < endpos; i += WARP_SIZE) + { + INDEX_TYPE next = static_cast(-1); + if (i < count - 1) + next = smem.chunkIndices[smem.indexing[wip]][i + 1]; + if (smem.chunkIndices[smem.indexing[wip]][i] < smem.useMaxId && smem.useMaxId <= next) + current = i + 1; + } + + uint32_t found = 
__ballot_sync(0xFFFFFFFF, current != endpos); + if (found != 0) + { + current = __shfl_sync(0xFFFFFFF, current, __ffs(found) - 1); + smem.chunkTakeElements[wip] = current; + } + + //not reduced to 0 -> set completed false + if (current != count) + smem.completed = 0; + } + __syncthreads(); + + + chunkWorkElements[0] = 0; + if (threadIdx.x < smem.numChunks) + { + chunkWorkElements[0] = smem.chunkTakeElements[threadIdx.x]; + } + } + else + { + //we can combine all at once! + chunkWorkElements[0] = 0; + if (threadIdx.x < smem.numChunks) + chunkWorkElements[0] = smem.chunkElementCount[smem.indexing[threadIdx.x]]; + } + + //use workdistribution to assign for loading + SingleLoadWorkDistribution:: template initialize(smem.single_workdistributionMem, smem.single_workdistributionTempMem, chunkWorkElements); + + int chunk[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = SingleLoadWorkDistribution:: template assignWorkAllThreads( + smem.single_workdistributionMem, smem.single_workdistributionTempMem, smem.single_workdistributionTempMemOutFull, + chunk, element); + + // ######## DEBUG + if (threadIdx.x == 0 && elements == 0 /*&& sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE*/) + { + //printf("Row: %u got 0 elements with maxID: %u\n", sharedRows[blockIdx.x], smem.useMaxId); + } + // ######## DEBUG + + int numOut; + // Combine entries + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + uint32_t combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const INDEX_TYPE* __restrict ip = smem.chunkIndices[smem.indexing[chunk[i]]]; + combIndex[i] = ip[element[i]]; + + const Either dp = smem.chunkValues[smem.indexing[chunk[i]]]; + + if (dp.isFirst()) { + auto idx_ = element[i]; + RIGHT_t right_ = dp.valFirst()[idx_]; + auto idx_r_ = chunk[i]; + auto idx_r_2_ = smem.indexing[idx_r_]; + auto left_ = smem.multiplier[idx_r_2_]; + data[i] = semiring.multiply(left_ , right_); + } else { + auto idx_ = element[i]; + data[i] = dp.valSecond()[idx_]; + } + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + __syncthreads(); + + + auto & j =smem.single_sAndCMem; + + auto fo = [](auto a, auto b) { + return a == b; + }; + + auto bq = [](auto a, auto b) { + return true; + }; + + numOut = 2; + numOut = SortAndCombiner::combine(j, + combIndex, + data, + combinedEntries, + fo,bq + , semiring); + + + __syncthreads(); + // ######## DEBUG + /*if (numOut == 0 && threadIdx.x == 0) + { + printf("%d %d oops in max chunks\n", blockIdx.x, threadIdx.x); + }*/ + //if (numOut == 0) + // return; + // ######## DEBUG + } + + // create new chunk (could also reuse old ones if completely used up...?) 
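+ // (Explanatory note, not part of the original kernel.) As in the generalized
+ // merge, thread 0 now allocates the output chunk and sets run_flag bit 0x1 on
+ // allocator exhaustion or 0x2 on chunk-pointer overflow. What differs in the
+ // max-chunks case is the restart payload: if longChunkOffset comes back as -1,
+ // the block dumps its per-chunk progress (remaining element counts, the advanced
+ // index/value pointers and the multipliers) into restart_chunkElementCount,
+ // restart_chunkIndices, restart_chunkValues and restart_multiplier, so a later
+ // launch can resume the merge without re-walking the chunk list.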
+ if (threadIdx.x == 0) + { + // Try to allocate chunk + uint32_t chunkoff; + int ignored; + // Update pre alloc before the actual allocation + if (!allocChunk(numOut, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + chunkoff = static_cast(-1); + atomicOr(run_flag, 0x1); + // Write restart state + restart_completion[blockIdx.x + restart_offset] = smem.restart; + } + else + { + //need to add flag and offset for copy later (offset = s) + uint32_t s = smem.sumOut; + //write chunk header + INDEX_TYPE actualrow = sharedRows[blockIdx.x]; + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = static_cast(-1); + atomicOr(run_flag,0x2); + if(chunk_pointer_position == chunk_pointer_sizes) + *chunk_pointer_pos = chunk_pointer_sizes; + // Write restart state + restart_completion[blockIdx.x + restart_offset] = smem.restart; + } + else + { + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::place(chunks, chunkoff, numOut, actualrow, Chunk::StartingOffsetFlag | s, 0)); + //write row count + s += numOut; + smem.sumOut = s; + output_row_count[actualrow] = s; + } + } + smem.longChunkOffset = chunkoff; + } + + smem.remCounter = 0; + __syncthreads(); + + if (smem.longChunkOffset == static_cast(-1)) + { + // Write out current state and return + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + if (lid == 0) + { + restart_chunkElementCount[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = smem.chunkElementCount[smem.indexing[wip]]; + restart_multiplier[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = smem.multiplier[smem.indexing[wip]]; + restart_chunkIndices[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = const_cast(smem.chunkIndices[smem.indexing[wip]]); + // FIXME: RL - casting like this is a sin + restart_chunkValues[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = *reinterpret_cast*>(&smem.chunkValues[smem.indexing[wip]]); + } + } + return; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + smem.longOutDataBuffer[pwrite] = combinedEntries[i].value; + smem.longOutIndexBuffer[pwrite] = combinedEntries[i].index; + } + } + __syncthreads(); + + //write out + if (written + threadIdx.x < numOut) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.longChunkOffset)->values_direct(numOut); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.longChunkOffset)->indices_direct(numOut); + + valstart[written + threadIdx.x] = smem.longOutDataBuffer[threadIdx.x]; + indexstart[written + threadIdx.x] = smem.longOutIndexBuffer[threadIdx.x]; + } + __syncthreads(); + } + + // Work is done, we can stop now + if (smem.completed) + break; + + //reduce all counts and adjust pointers + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + uint32_t count = smem.chunkElementCount[smem.indexing[wip]]; + uint32_t rem = smem.chunkTakeElements[wip]; + + uint32_t newcount = count - rem; + 
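+ // (Explanatory note.) The rem elements consumed in this iteration are dropped
+ // from the front of the chunk: the element count shrinks to newcount and the
+ // index/value pointers are advanced by rem in the statements below. Afterwards
+ // MERGE_MAX_PATH_OPTIONS fresh column-id samples are taken from the remainder,
+ // and the surviving counts are accumulated into remCounter to decide whether the
+ // next iteration can combine everything that is left in one pass (smem.completed).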
smem.chunkElementCount[smem.indexing[wip]] = newcount; + const INDEX_TYPE* __restrict newchunkIndices = smem.chunkIndices[smem.indexing[wip]] + rem; + smem.chunkIndices[smem.indexing[wip]] = newchunkIndices; + Either newchunkValues; //fixme RL : add restrict on interior types? + + if (smem.chunkValues[smem.indexing[wip]].isFirst()) { + newchunkValues = Either::First(smem.chunkValues[smem.indexing[wip]].valFirst() + rem); + } else { + newchunkValues = Either::Second(smem.chunkValues[smem.indexing[wip]].valSecond() + rem); + } + + smem.chunkValues[smem.indexing[wip]] = newchunkValues; + + uint32_t step = (newcount + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + uint32_t test = min(newcount - 1, step * lid); + INDEX_TYPE id = newcount > 0 ? newchunkIndices[test] : 0xFFFFFFFF; + smem.id_samples[wip][lid] = id; + + if (lid == 0) + atomicAdd(&smem.remCounter, newcount); + } + __syncthreads(); + + // ######## DEBUG + //printSampling(sharedRows, smem.numChunks, smem.id_samples, ROW_TO_INVESTIGATE); + // ######## DEBUG + + smem.completed = smem.remCounter < ELEMENTS_PER_THREAD*THREADS ? 1 : 0; + if (threadIdx.x == 0) + { + smem.restart = smem.completed ? RESTART_ITERATION_FINISH : RESTART_ITERATION_UNKNOWN; + smem.halveStep = 0; + } + __syncthreads(); + } + + // This row is done + if (threadIdx.x == 0) + { + shared_rows_handled[blockIdx.x + restart_offset] = 1; + } + + return; +} + + +template + void AcSpGEMMKernels::h_mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + + mergeSharedRowsMaxChunks<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, + restart_chunkIndices, restart_chunkValues, restart_multiplier, restart_chunkElementCount, restart_offset, restart_num_chunks, chunk_pointer_pos, semiring); +} + + +#define GPUCompressedMatrixMatrixMultiplyMergeMaxChunks(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsMaxChunks \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, \ + uint32_t** restart_chunkIndices, TYPE** restart_chunkValues, TYPE* restart_multiplier, uint32_t* restart_chunkElementCountDataOffset2, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos); diff --git a/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh 
b/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh new file mode 100644 index 00000000..09bb0d02 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh @@ -0,0 +1,393 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeSimple.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" + +// ######################################################################################### +// +// Simple Case +// +// ######################################################################################### +template +__global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsSimple(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + using Chunk = ::Chunk; + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SortType = ChooseBitDataType; + const uint32_t SharedRowsShift = LONG_SORT ? 
32 : count_clz::value; + const uint32_t SharedRowsBits = 32 - count_clz::value; + const SortType SharedRowsColMask = (SortType(1) << SharedRowsShift) - 1; + const SortType SharedRowsMaskShifted = ~SharedRowsColMask; + using LoadWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + + struct SMem + { + + uint32_t runflag, chunk_pointer_position; + uint32_t startSharedRow, numSharedRow; + INDEX_TYPE minColumnId[THREADS]; + + union + { + struct + { + const typename SEMIRING_t::output_t* dataPointer[2 * THREADS]; + union + { + ushort2 fromDataOffset[THREADS]; + uint16_t dataToIndexOffset[2 * THREADS]; + }; + struct { + typename LoadWorkDistribution::SharedMemT workdistributionMem; + typename LoadWorkDistribution::SharedTempMemT workdistributionTempMem; + typename LoadWorkDistribution:: template SharedTempMemOutT workdistributionTempMemOutFull; + }; + }; + + typename SortAndCombiner::SMem sAndCMem; + + struct + { + typename SEMIRING_t::output_t outDataBuffer[THREADS]; + INDEX_TYPE outIndexBuffer[THREADS]; + ushort2 outRowIdRowOffsetBuffer[THREADS]; + uint32_t outRowCounts[THREADS]; + uint32_t outChunkOffset[THREADS]; + }; + }; + }; + + __shared__ SMem smem; + + //get my block's offset + if (threadIdx.x == 0) + { + uint32_t bstart = blockOffsets[blockIdx.x]; + uint32_t shared_handled = shared_rows_handled[blockIdx.x + restart_offset]; + smem.startSharedRow = bstart + shared_handled; + smem.numSharedRow = blockOffsets[blockIdx.x + 1] - (bstart + shared_handled); + smem.runflag = *run_flag; + } + + __syncthreads(); + + if (smem.numSharedRow == 0) + return; + + int count[2] = { 0, 0 }; + + //load all chunk information + if (threadIdx.x < smem.numSharedRow) + { + uint32_t idoffset[2] = { 0, 0 }; + uint32_t access_index[2] = { 0, 1 }; + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[smem.startSharedRow + threadIdx.x]]); + // if (sharedRows[smem.startSharedRow + threadIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in SIMPLE\n", sharedRows[smem.startSharedRow + threadIdx.x]); + bool first_row = (chunk & 2) != 0; + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + Chunk* __restrict second; + if (first_row) + { + second = pChunk->readNextFront(); + } + else + { + second = pChunk->readNextBack(); + } + bool first_row2 = (reinterpret_cast(second) & 2) != 0; + second = reinterpret_cast(reinterpret_cast(second) & 0xFFFFFFFFFFFFFFFCULL); + +#ifdef ENABLE_SORTING + if (second->sort_key < pChunk->sort_key) + { + // Reverse access order + access_index[0] = 1; + access_index[1] = 0; + } +#endif + + INDEX_TYPE minColumnId; + + const typename SEMIRING_t::output_t* pdata; + idoffset[0] = pChunk->num_entries; + if (first_row) + { + count[access_index[0]] = pChunk->firstCountCleared(); + pdata = pChunk->values_direct(idoffset[0]); + minColumnId = pChunk->indices_direct(idoffset[0])[0]; + idoffset[0] = idoffset[0] * sizeof(typename SEMIRING_t::output_t); + pChunk->setFirstConsumed(); + } + else + { + count[access_index[0]] = pChunk->lastCountCleared(); + uint32_t baseoffset = idoffset[0] - count[access_index[0]]; + pdata = pChunk->values_direct(idoffset[0]) + baseoffset; + minColumnId = pChunk->indices_direct(idoffset[0])[baseoffset]; + idoffset[0] = count[access_index[0]] * sizeof(typename SEMIRING_t::output_t) + baseoffset * sizeof(INDEX_TYPE); + pChunk->setLastConsumed(); + } + + smem.dataPointer[2 * threadIdx.x + access_index[0]] = pdata; + + idoffset[1] = 
second->num_entries; + //we dont need to figure out whether the second pointer is front or back, as front follows back and vice versa + if (first_row2) + { + count[access_index[1]] = second->firstCountCleared(); + minColumnId = min(minColumnId, second->indices_direct(idoffset[1])[0]); + pdata = second->values_direct(idoffset[1]); + idoffset[1] = idoffset[1] * sizeof(typename SEMIRING_t::output_t); + second->setFirstConsumed(); + } + else + { + count[access_index[1]] = second->lastCountCleared(); + uint32_t baseoffset = idoffset[1] - count[access_index[1]]; + minColumnId = min(minColumnId, second->indices_direct(idoffset[1])[baseoffset]); + pdata = second->values_direct(idoffset[1]) + baseoffset; + idoffset[1] = count[access_index[1]] * sizeof(typename SEMIRING_t::output_t) + baseoffset * sizeof(INDEX_TYPE); + second->setLastConsumed(); + } + + smem.dataPointer[2 * threadIdx.x + access_index[1]] = pdata; + smem.fromDataOffset[threadIdx.x] = make_ushort2(idoffset[access_index[0]], idoffset[access_index[1]]); + smem.minColumnId[threadIdx.x] = minColumnId; + } + + //use workdistribution to assign for loading + LoadWorkDistribution::template initialize(smem.workdistributionMem, smem.workdistributionTempMem, count); + + int rowPair[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = LoadWorkDistribution:: template assignWorkAllThreads( + smem.workdistributionMem, smem.workdistributionTempMem, smem.workdistributionTempMemOutFull, + rowPair, element); + + int numOut; + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + SortType combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const typename SEMIRING_t::output_t* dp = smem.dataPointer[rowPair[i]]; + const INDEX_TYPE* colptr = reinterpret_cast(reinterpret_cast(dp) + smem.dataToIndexOffset[rowPair[i]]); + INDEX_TYPE colid = colptr[element[i]]; + data[i] = dp[element[i]]; + uint32_t rowId = rowPair[i] / 2; + SortType redcolid = colid - smem.minColumnId[rowId]; + /*if (redcolid >= (SortType(1) << SharedRowsShift)) + printf("data mix up happening: %d >= %d (shift %d, off %d)!\n", redcolid, 1 << SharedRowsShift, SharedRowsShift, smem.minColumnId[rowId]);*/ + combIndex[i] = (static_cast(rowId) << SharedRowsShift) | redcolid; + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + + __syncthreads(); + + numOut = SortAndCombiner::combine(smem.sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [SharedRowsMaskShifted](auto a, auto b) { + return (a & SharedRowsMaskShifted) == (b & SharedRowsMaskShifted); + }, semiring, LONG_SORT ? 
(32 + SharedRowsBits + 1) : 32); + } + + __syncthreads(); + + //write count for rows + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (combinedEntries[i].isRowend()) + { + uint32_t row = combinedEntries[i].index >> SharedRowsShift; + uint32_t rcount = combinedEntries[i].rowcount(); + smem.outRowCounts[row] = rcount; + } + } + + __syncthreads(); + + // Let's see if we can go ahead + if (threadIdx.x < smem.numSharedRow) + { + uint32_t chunkoff = 0xFFFFFFFF; + int ignored; + uint32_t elcount = smem.outRowCounts[threadIdx.x]; + if (!allocChunk(elcount, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + // We have to restart for this block at this point, set run_flag and remember how many rows are left + atomicOr(run_flag, 0x1); + smem.runflag = 1; + } + else + { + smem.outChunkOffset[threadIdx.x] = chunkoff; + } + } + __syncthreads(); + if (smem.runflag != 0) + { + return; + } + + if (threadIdx.x == 0) + { + smem.chunk_pointer_position = atomicAdd(chunk_pointer_alloc, smem.numSharedRow); + if (smem.chunk_pointer_position + smem.numSharedRow > chunk_pointer_sizes) + { + atomicOr(run_flag, 0x2); + smem.runflag = 1; + if (smem.chunk_pointer_position <= chunk_pointer_sizes) + *chunk_pointer_pos = smem.chunk_pointer_position; + } + } + __syncthreads(); + if (smem.runflag != 0) + { + return; + } + + // Allocate chunk for each row and update count in global + if (threadIdx.x < smem.numSharedRow) + { + uint32_t elcount = smem.outRowCounts[threadIdx.x]; + INDEX_TYPE actualrow = sharedRows[smem.startSharedRow + threadIdx.x]; + //write chunk pointer + chunks_pointers[smem.chunk_pointer_position + threadIdx.x] = reinterpret_cast(Chunk::place(chunks, smem.outChunkOffset[threadIdx.x], elcount, actualrow, 0, 0)); + //write row count + output_row_count[actualrow] = elcount; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + uint32_t row = combinedEntries[i].index >> SharedRowsShift; + smem.outDataBuffer[pwrite] = combinedEntries[i].value; + smem.outIndexBuffer[pwrite] = static_cast(combinedEntries[i].index & SharedRowsColMask) + smem.minColumnId[row]; + smem.outRowIdRowOffsetBuffer[pwrite] = make_ushort2(row, combinedEntries[i].rowcount() - 1); + } + } + __syncthreads(); + + //write out + if (written + threadIdx.x < numOut) + { + ushort2 row_offset = smem.outRowIdRowOffsetBuffer[threadIdx.x]; + uint32_t chunkoffset = smem.outChunkOffset[row_offset.x]; + if (chunkoffset != 0xFFFFFFFF) + { + uint32_t count = smem.outRowCounts[row_offset.x]; + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, chunkoffset)->values_direct(count); + INDEX_TYPE* indexstart = Chunk::cast(chunks, chunkoffset)->indices_direct(count); + valstart[row_offset.y] = smem.outDataBuffer[threadIdx.x]; + indexstart[row_offset.y] = smem.outIndexBuffer[threadIdx.x]; + } + } + __syncthreads(); + } + + // Indicator for restart + if (threadIdx.x == 0) + shared_rows_handled[blockIdx.x + restart_offset] += smem.numSharedRow; + + return; +} + + +template + void AcSpGEMMKernels::h_mergeSharedRowsSimple(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* 
chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + mergeSharedRowsSimple<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, restart_offset, chunk_pointer_pos, semiring); +} + +#define GPUCompressedMatrixMatrixMultiplyMergeSimple(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsSimple \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos); \ + template void AcSpGEMMKernels::h_mergeSharedRowsSimple \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos); + \ No newline at end of file diff --git a/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh b/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh new file mode 100644 index 00000000..f118e910 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh @@ -0,0 +1,1132 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * SpGEMM.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include +#include +#include +#include "MultiplyKernels.h" +#include "Chunk.cuh" +#include "HelperFunctions.cuh" +#include "WorkDistribution.cuh" +#include "ARowStorage.cuh" +#include "SortAndCombine.cuh" + + +//SORT_TYPE_MODE 0 .. 32bit direct, 1 32bit row remap, 2 64bit full +template + __global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + const typename SEMIRING_t::rightInput_t *__restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + static_assert(RETAIN_ELEMENTS_PER_THREAD >= 1, "need at least one temporary element per thread to assure coalesced write out"); + // fetch A data + // tag with row and col ids + // + // fill work distribution + // + // fetch rows from B (each thread fetches one element) + // multiply and sort in (multiply, sort, prefix sum) + // run a scan to combine, compute row offset, and memory offset + // + // either write out chunk or keep all data/last row in shared memory to continue the combination + + const int NNZ_PER_BLOCK = NNZ_PER_THREAD*THREADS; + const int TEMP_ITEMS_PER_BLOCK = (RETAIN_ELEMENTS_PER_THREAD*THREADS); + + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + + //SORT_TYPE_MODE 0 .. 32bit direct, 1 32bit row remap, 2 64bit full + using SortType = ChooseBitDataType<(SORT_TYPE_MODE > 1 ) ? 
64 : 32>; + + const uint32_t ChunkSortingBits = (sizeof(ChunkSortType) * 8) - count_clz::value; + + // the number of elements each threads handles in registers + const int CombineElements = INPUT_ELEMENTS_PER_THREAD + RETAIN_ELEMENTS_PER_THREAD; + + // cutoff for rows in B which will directly be forwarded to the merge stage + const uint32_t LongRowCutOff = CombineElements * THREADS / 2; + + // used data types specialized for the setup + using RowelementWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + using SimpleScan = cub::BlockScan; + using SimpleIntScan = cub::BlockScan; + using Chunk = Chunk; + using DirectChunk = DirectChunk; + + using ARowStorage = ARowStorage; + struct SMem + { + + // flattened out A data + //INDEX_TYPE A_row_ids[NNZ_PER_BLOCK]; + uint32_t chunk_pointer_position, chunk_counter; + ARowStorage A_row_ids; + INDEX_TYPE A_col_ids[NNZ_PER_BLOCK]; + typename SEMIRING_t::leftInput_t A_indata[NNZ_PER_BLOCK]; + + + + + // comb data + union { + struct { + INDEX_TYPE current_col_ids[TEMP_ITEMS_PER_BLOCK]; + typename ARowStorage::EncodedRowType current_row_ids[TEMP_ITEMS_PER_BLOCK < THREADS ? THREADS + 1 : TEMP_ITEMS_PER_BLOCK]; + typename SEMIRING_t::output_t current_output[TEMP_ITEMS_PER_BLOCK]; + }; + struct { + uint32_t temp_work_storage_single[NNZ_PER_BLOCK]; + }; + }; + + //TODO: temp mem and comb data could be overlapped!? + + // temp mem + union { + struct { + typename RowelementWorkDistribution::SharedTempMemT workdistributionTempMem; + typename RowelementWorkDistribution:: template SharedTempMemOutT workdistributionTempMemOutFull; + }; + struct { + typename SimpleScan::TempStorage directChunkScanTempMem; + typename SimpleScan::TempStorage nonDirectChunkScanTempMem; + }; + typename SimpleIntScan::TempStorage intScanTempMem; + typename SortAndCombiner::SMem sAndCMem; + INDEX_TYPE rowCounts[TEMP_ITEMS_PER_BLOCK]; + }; + + + //work distribution + typename RowelementWorkDistribution::SharedMemT workdistributionMem; + + INDEX_TYPE minCol, maxCol; + typename ARowStorage::EncodedRowType minRow, maxRow; + + uint32_t chunkStartOffset; + uint32_t firstRowCount; + uint32_t lastRowCount; + uint32_t runflag; + uint32_t directChunkRows; + uint32_t brokenChunkOffsetStart, brokenChunkOffsetEnd; + + typename ARowStorage::EncodedRowType minBrokenChunkRow, maxBrokenChunkRow; + }; + + __shared__ SMem smem; + + __shared__ uint32_t block_start_end[2]; + //__shared__ int currentStartElementIndex, currentEndElementIndex; + //__shared__ uint32_t elem_handled_A, elem_handled_B, max_A, max_B, restart; + //__shared__ float lastExpected; + __shared__ int tempOffset, tempData, workavailable, consumedwork; + + // get block data + if (threadIdx.x < 2) + { + block_start_end[threadIdx.x] = startingIdsA[blockIdx.x + threadIdx.x]; + //smem.A_row_ids[0] = static_cast(-1); + //currentEndElementIndex = completion_status[blockIdx.x]; + //lastExpected = 0.0f; + + // if we stopped globally, dont even start, otherwise consider restart + //if (threadIdx.x == 0 && completion_status[blockIdx.x] != 0 && completion_status[blockIdx.x] != 0xFFFFFFFF) + // printf("%d restarting with %x %d\n", blockIdx.x, completion_status[blockIdx.x], completion_status[blockIdx.x] & (~0x80000000)); + smem.chunk_pointer_position = 0; + smem.directChunkRows = 0; + smem.runflag = *run_flag != 0 ? 
0xFFFFFFFF : completion_status[blockIdx.x]; + smem.chunk_counter = chunk_counter[blockIdx.x]; + + // for consume based restart, set consumedwork too + consumedwork = (smem.runflag & 0x80000000) == 0 ? smem.runflag : 0; + } + + smem.A_row_ids.clear(); + + __syncthreads(); + if (smem.runflag == std::numeric_limits::max()) + return; + + int worknnz = min(NNZ_PER_BLOCK, nnz - blockIdx.x * NNZ_PER_BLOCK); + + // Assign column ids of a + //TODO: adjust num threads per row either dynamic (could be always pow 2) or a few preset static ones + for (uint32_t r = block_start_end[0] + threadIdx.x; r <= block_start_end[1]; r += THREADS) + { + int ain = static_cast(offsetsA[r] - blockIdx.x * NNZ_PER_BLOCK); + int bin = offsetsA[min(rows, r + 1)] - blockIdx.x * NNZ_PER_BLOCK; + + int a = max(0, ain); + int b = min(static_cast(worknnz), bin); + + //iterate over all threads that start with that row + if (a < b) + { + smem.A_row_ids.storeReference(a, r); + int ra = a; + smem.A_row_ids.storeRow(a, ra, r); + for (++a; a < b; ++a) + smem.A_row_ids.storeRow(a, ra, r); + } + } + + __syncthreads(); + + bool directChunkRows = false; + int workToDistribute[NNZ_PER_THREAD]; + + // Read out lengths of rows from B for each element from A + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + uint32_t w = threadIdx.x + i * THREADS; + INDEX_TYPE a_col = 0; + uint32_t b_num = 0; + + + if (w < worknnz) + { + // normal case or work element based restart + bool load = true; + + if(load) + { + uint32_t l = w + blockIdx.x * NNZ_PER_BLOCK; + a_col = indicesA[l]; + b_num = offsetsB[a_col + 1] - offsetsB[a_col]; + + smem.A_col_ids[w] = indicesA[l]; + smem.A_indata[w] = valA[l]; + + // Long rows are directly referred to the merge stage by only writing an identifier chunck info + if (b_num >= LongRowCutOff) + { + // remember that we are now deadling with a dirct chunk row, which needs sorting + b_num = b_num | 0x80000000; + directChunkRows = true; + } + else if ((smem.runflag & 0x80000000) != 0) + { + // row based restart needs to set the consumed work too + uint32_t to_start_row = smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0]); + if (smem.A_row_ids.getEncodedRow(w) < to_start_row) + { + //printf("%d %d load %x\n", blockIdx.x, threadIdx.x, completion_status[blockIdx.x]); + atomicAdd(&consumedwork, b_num); + b_num = 0; + } + } + } + } + workToDistribute[i] = b_num; + } + + // move all direct chunk rows to the front so we can quickly identify them later + if (__syncthreads_or(directChunkRows)) + { + // only write out during first run + if (smem.runflag == 0) + { + uint32_t chunkoff[NNZ_PER_THREAD]; + bool success = true; + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // alloc special chunk and write out + if ((workToDistribute[i] & 0x80000000) != 0) + { + //FIXME: This is the wrong typez + // printf("%d %d allocating direct chunk for size %d\n", blockIdx.x, threadIdx.x, (workToDistribute[i] & (~0x80000000))); + if (!allocDirectChunk(chunk_alloc, chunk_size, chunkoff[i])) + { + success = false; + atomicOr(run_flag, 0x1); + } + atomicAdd(&(smem.chunk_pointer_position), 1); + } + } + if (__syncthreads_or(!success)) + { + //re start with old state and alloc all chunks in next run + return; + } + + if (threadIdx.x == 0) + { + uint32_t num_chunks = smem.chunk_pointer_position; + smem.chunk_pointer_position = atomicAdd(chunk_pointer_alloc, num_chunks); + if (smem.chunk_pointer_position + num_chunks >= chunk_pointer_sizes) + { + success = false; + 
atomicOr(run_flag, 0x2); + if(smem.chunk_pointer_position < chunk_pointer_sizes) + *chunk_pointer_pos = smem.chunk_pointer_position; + } + } + if (__syncthreads_or(!success)) + { + //re start with old state and alloc all chunks in next run + return; + } + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + if ((workToDistribute[i] & 0x80000000) != 0) + { + // printf("%d %d added DirectChunk for row \n", blockIdx.x, threadIdx.x); + + // write chunk data + DirectChunk * p_chunk = DirectChunk::cast(chunks, chunkoff[i]); + chunks_pointers[atomicAdd(&(smem.chunk_pointer_position), 1)] = reinterpret_cast(p_chunk); + + uint32_t w = threadIdx.x + i * THREADS; + auto encodedRow = smem.A_row_ids.getEncodedRow(w); + INDEX_TYPE r = smem.A_row_ids.decodeRow(encodedRow); + INDEX_TYPE a_col = smem.A_col_ids[w]; + uint32_t b_num = workToDistribute[i] & (~0x80000000); + DirectChunk::place(chunks, chunkoff[i], b_num, r, indicesB + offsetsB[a_col], valB + offsetsB[a_col], smem.A_indata[w], (static_cast(blockIdx.x) << ChunkSortingBits) | (threadIdx.x + i*THREADS + NNZ_PER_BLOCK)); + addPotentiallySharedRow(r, p_chunk, true, output_row_list_heads, shared_rows_tracker, shared_rows_alloc, true); + + // if ((r == 0)) + // printf("We have a direct chunk in row: %u with %u elements with col: %u\n", r, b_num, a_col); + + atomicAdd(output_row_chunk_count + r, 1); + // mark so we do not go through simple merge + if (INPUT_ELEMENTS_PER_THREAD * THREADS * MERGE_MAX_PATH_OPTIONS >= b_num) + { + // Set both top most bits if this can go to max chunks case + atomicOr(output_row_chunk_count + r, MAX_CHUNKS_CASE); + } + else + { + // Only set the topmost bit if this should go to the generalized case + atomicOr(output_row_chunk_count + r, GENERALIZED_CASE); + } + + + //no need to set count, as we will go through max or general merge anyway + // atomicAdd(output_row_count + r, CombineElements * THREADS); + + } + } + } + + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + smem.temp_work_storage_single[threadIdx.x + i * THREADS] = ((workToDistribute[i] & 0x80000000) != 0) ? 
0xFFFFFFFF : workToDistribute[i]; + __syncthreads(); + + // run a prefix sum to figure out where to place the direct chunk row ids and others + uint32_t direct[NNZ_PER_THREAD], nonDirect[NNZ_PER_THREAD]; + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // note stripped layout + if (smem.temp_work_storage_single[threadIdx.x * NNZ_PER_THREAD + i] == 0xFFFFFFFF) + { + direct[i] = 1; + nonDirect[i] = 0; + } + else + { + direct[i] = 0; + nonDirect[i] = 1; + } + } + uint32_t sum_direct; + SimpleScan(smem.directChunkScanTempMem).ExclusiveSum(direct, direct, sum_direct); + SimpleScan(smem.nonDirectChunkScanTempMem).ExclusiveSum(nonDirect, nonDirect); + + INDEX_TYPE a_col[NNZ_PER_THREAD]; + VALUE_TYPE1 a_vals[NNZ_PER_THREAD]; + typename ARowStorage::EncodedRowType a_rowIds[NNZ_PER_THREAD]; + + //fetch the data + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // note stripped layout + int r = threadIdx.x * NNZ_PER_THREAD + i; + a_col[i] = smem.A_col_ids[r]; + a_vals[i] = smem.A_indata[r]; + a_rowIds[i] = smem.A_row_ids.getEncodedRow(r); + workToDistribute[i] = smem.temp_work_storage_single[r]; + } + __syncthreads(); + + //store shuffled and cleared workload + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + + // note stripped layout + uint32_t p = nonDirect[i] + sum_direct; + if (workToDistribute[i] == 0xFFFFFFFF) + { + workToDistribute[i] = 0; + p = direct[i]; + } + + smem.A_col_ids[p] = a_col[i]; + smem.A_indata[p] = a_vals[i]; + smem.A_row_ids.storeEncodedRow(p, a_rowIds[i]); + smem.temp_work_storage_single[p] = workToDistribute[i]; + } + + smem.directChunkRows = sum_direct; + __syncthreads(); + + // load new work + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + workToDistribute[i] = smem.temp_work_storage_single[threadIdx.x * NNZ_PER_THREAD + i]; + + // Initialize the work distribution from stripped layout + RowelementWorkDistribution:: template initialize(smem.workdistributionMem, smem.workdistributionTempMem, workToDistribute); + } + else + { + // Initialize the work distribution from blocked layoyt and run while work is available + RowelementWorkDistribution:: template initialize(smem.workdistributionMem, smem.workdistributionTempMem, workToDistribute); + } + + // now kept in shared + tempData = 0; + tempOffset = 0; + + + // comsume based restart + if (smem.runflag != 0 && (smem.runflag & 0x80000000) == 0) + { + RowelementWorkDistribution::removework(smem.workdistributionMem, smem.runflag); + } + + + // note: potential race condition with removework, however the entire work will never be removed and we only compare with > 0 -> fine? + // TODO: -> we can remove syncthreads!? 
+ __syncthreads(); + + workavailable = RowelementWorkDistribution::workAvailable(smem.workdistributionMem); + while (workavailable > 0) + { + + int localAEntry[CombineElements]; + int elementB[CombineElements]; + + int elements = RowelementWorkDistribution:: template assignWorkAllThreads( + smem.workdistributionMem, smem.workdistributionTempMem, smem.workdistributionTempMemOutFull, + localAEntry, elementB, CombineElements*THREADS - tempData); + + if(threadIdx.x == 0) + consumedwork += CombineElements*THREADS - tempData; + + + typename ARowStorage::EncodedRowType temp_row[CombineElements]; + INDEX_TYPE temp_col_id[CombineElements]; + typename SEMIRING_t::output_t temp_val[CombineElements]; + + smem.minCol = smem.minRow = std::numeric_limits::max(); + smem.maxCol = smem.maxRow = 0; + + // locel min/max row and col + INDEX_TYPE minRow = std::numeric_limits::max(), maxRow = 0; + INDEX_TYPE minCol = std::numeric_limits::max(), maxCol = 0; + + + + //fetch B data and set MIN/MAX values for how many rows in A and how many cols and B are touched + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (i < elements) + { + uint32_t aentry = localAEntry[i]; + uint32_t fetch_row = smem.A_col_ids[aentry]; + temp_row[i] = smem.A_row_ids.getEncodedRow(aentry); + minRow = min(minRow, temp_row[i]); + maxRow = max(maxRow, temp_row[i]); + + + //if (elementB[i] < 0 || aentry >= worknnz) + // printf("%d %d [%d]: max %d - nnz: %d - req: %d/%d - %d %d\n", blockIdx.x, threadIdx.x, i, elements, worknnz, CombineElements*THREADS - tempData, workavailable, localAEntry[i], elementB[i]); + + INDEX_TYPE elb = offsetsB[fetch_row] + elementB[i]; + temp_col_id[i] = indicesB[elb]; + temp_val[i] = semiring.multiply(smem.A_indata[aentry], valB[elb]); + + minCol = min(minCol, temp_col_id[i]); + maxCol = max(maxCol, temp_col_id[i]); + } + else + { + // get from last iteration + int t = i * THREADS + threadIdx.x - (CombineElements*THREADS - tempData); + if (t >= 0) + { + int access = (tempOffset + t) % TEMP_ITEMS_PER_BLOCK; + // offset tells us where the last row data is currently placed + temp_row[i] = smem.current_row_ids[access]; + temp_col_id[i] = smem.current_col_ids[access]; + temp_val[i] = smem.current_output[access]; + + // printf("%d %d (%d %d %d): %d %d %f\n", blockIdx.x, threadIdx.x, access, t, tempData, temp_row[i], temp_col_id[i], temp_val[i]); + + minRow = min(minRow, temp_row[i]); + maxRow = max(maxRow, temp_row[i]); + + minCol = min(minCol, temp_col_id[i]); + maxCol = max(maxCol, temp_col_id[i]); + + //dummy value to indicate that we have something + elementB[i] = 1; + } + else + //indicate that we are empty + elementB[i] = -1; + } + } + + // + updateMinValue(smem.minCol, minCol); + updateMinValue(smem.minRow, minRow); + updateMaxValue(smem.maxCol, maxCol); + updateMaxValue(smem.maxRow, maxRow); + + __syncthreads(); + + + INDEX_TYPE colRange = smem.maxCol - smem.minCol; + INDEX_TYPE rowRange = smem.maxRow - smem.minRow + 1; + INDEX_TYPE colBits = 32 - __clz(colRange); + INDEX_TYPE rowBits = 32 - __clz(rowRange); + + + if (colBits + rowBits > 32 && threadIdx.x == 0) + { + printf("colRange: %u rowRange: %u colBits: %u rowBits: %u | minCol: %u maxCol: %u | minRow: %u maxRow: %u\n", colRange, rowRange, colBits, rowBits, smem.minCol, smem.maxCol, smem.minRow, smem.maxRow); + //return; + } + + ScanCombinerEntry combinedEntries[CombineElements]; + { + //TODO: if there are fewer items only, we want to only sort those... + //TODO: if we can use uint32_t instead of uint64_t we want to use that... 
+ SortType combIndex[CombineElements]; + typename SEMIRING_t::output_t data[CombineElements]; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (elementB[i] >= 0) + { + combIndex[i] = (static_cast(temp_row[i] - smem.minRow) << colBits) | (temp_col_id[i] - smem.minCol); + data[i] = temp_val[i]; + } + else + { + combIndex[i] = ~SortType(0); + data[i] = SEMIRING_t::AdditiveIdentity(); + } + } + + tempData = SortAndCombiner::combine(smem.sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [colBits](auto a, auto b) { + return (a >> colBits) == (b >> colBits); + }, semiring, + colBits + rowBits); + + } + + + workavailable = RowelementWorkDistribution::workAvailable(smem.workdistributionMem); + + //we would like to know how many elements we have from the last row + // TODO: check if that is right + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + if (combinedEntries[i].isRowend() && combinedEntries[i].memoffset() == tempData - 1) + smem.lastRowCount = combinedEntries[i].rowcount(); + + __syncthreads(); + + // if (threadIdx.x == 0) + // printf("%d decision to make: %d >= %d || !%d || 8 * %d < %d\n", blockIdx.x, tempData, TEMP_ITEMS_PER_BLOCK, workavailable, smem.lastRowCount, tempData); + + // TODO: check heuristic + // if we must go out or if the last row is very small in comparison to the other data + if (tempData >= TEMP_ITEMS_PER_BLOCK || workavailable <= 0 || 1* smem.lastRowCount < tempData) + { + // keep the last row around if we can so we reduce the amount of merging we have to perform + int allocData = workavailable > 0 && smem.lastRowCount < TEMP_ITEMS_PER_BLOCK ? tempData - smem.lastRowCount : tempData; + + // determine how many chunks we need to generate (additional ones for single row chunks in between) + bool multiChunk = false; + + if (smem.directChunkRows != 0) + { + for (uint32_t i = threadIdx.x; i < smem.directChunkRows; i += THREADS) + multiChunk = multiChunk || (smem.minRow < smem.A_row_ids.getEncodedRow(i) && smem.A_row_ids.getEncodedRow(i) < smem.maxRow); + + multiChunk = __syncthreads_or(multiChunk); + } + + + if (multiChunk) + { + // we need to separate the output into multiple chunks + //if (threadIdx.x == 0 && (smem.maxRow == 7094 || smem.maxRow == 6025 || smem.maxRow == 5086 || smem.maxRow == 5273 || smem.maxRow == 7350)) + // printf("%d %d split chunk for %d-%d .. 
%d %d\n", blockIdx.x, threadIdx.x, smem.minRow, smem.maxRow, allocData, tempData); + + // init smem + smem.brokenChunkOffsetStart = 0; + smem.minBrokenChunkRow = smem.minRow; + smem.maxBrokenChunkRow = smem.maxRow; + + + // determine individual chunk ends + // iterate over shared rows list and my data to see how many chunk boundaries i need to add + // need access to the next element -> store in shared + smem.current_row_ids[threadIdx.x+1] = (combinedEntries[CombineElements-1].index >> colBits) + smem.minRow; + smem.current_row_ids[0] = smem.minRow; + + __syncthreads(); + uint32_t chunk_splitting_row_id = 0; + uint32_t chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + typename ARowStorage::EncodedRowType r = smem.current_row_ids[threadIdx.x]; + // search for the first chunk breaking row that is larger than the row handled by the previous thread + // ie find the first chunk breaking row that can be relevant for my entries + while (chunk_splitting_row <= r) + { + if (++chunk_splitting_row_id < smem.directChunkRows) + { + chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + } + else + { + // this threads entries are above all chunk breaking rows, so set it to max + chunk_splitting_row = smem.maxRow + 1; + break; + } + } + + //if (threadIdx.x == 0 && smem.maxRow == 7094) + //{ + // printf("Min: %u and Max: %u\n", smem.minRow, smem.maxRow); + //} + + //if (/*threadIdx.x == 0 &&*/ smem.maxRow == 7094) + //{ + // printf("Chunk splitting row: %u with r: %u\n", chunk_splitting_row, r); + //} + + // determine where to break + static_assert(CombineElements <= 32, "can handle a maximum of 32 CombinedElements when performing multi chunk out"); + uint32_t chunk_breaks = 0; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + typename ARowStorage::EncodedRowType next_r = min(static_cast((combinedEntries[i].index >> colBits)) + smem.minRow, smem.maxRow); + /*if (smem.maxRow == 7094 && r < 7100 && next_r < 7100 && r != next_r) + { + printf("Row given: %u | %u nextrow\n", r, next_r); + }*/ + /*if (r != next_r && chunk_splitting_row <= next_r && (next_r == smem.maxRow) && chunk_splitting_row != smem.maxRow) + { + printf("R: %u | next_R: %u | chunk_splitting: %u | max: %u ----- directid: %u maxid: %u\n", r, next_r, chunk_splitting_row, smem.maxRow, chunk_splitting_row_id, smem.directChunkRows); + }*/ + /*if (r != next_r && chunk_splitting_row <= next_r && next_r != smem.maxRow)*/ + /*if (r != next_r && chunk_splitting_row <= next_r && chunk_splitting_row != smem.maxRow && tempData == allocData)*/ + /*if (r != next_r && chunk_splitting_row <= next_r && (next_r != smem.maxRow || next_r == 7094 || next_r == 6025))*/ + if (r != next_r && chunk_splitting_row <= next_r && (next_r != smem.maxRow || (chunk_splitting_row != smem.maxRow && tempData == allocData))) + { + // we are at a chunk boundary + chunk_breaks |= (1 << i); + //if(smem.maxRow == 7094) + // printf("%d %d breaks chunk between %d %d\n", blockIdx.x, threadIdx.x, r, next_r); + // find next + do + { + if (++chunk_splitting_row_id < smem.directChunkRows) + chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + else + { + chunk_splitting_row = smem.maxRow + 1; + break; + } + } while (chunk_splitting_row <= next_r); + } + r = next_r; + } + + // run prefix sum to figure out how many chunk breaks to insert + int num_broken_chunks[1] = { __popc(chunk_breaks) }; + int overall_broken_chunk, my_starting_offset[1]; + 
SimpleIntScan(smem.intScanTempMem).ExclusiveSum(num_broken_chunks, my_starting_offset, overall_broken_chunk); + + + //if (threadIdx.x == 0 && smem.maxRow == 7094) + // printf("%d %d overall broken chunks: %d\n", blockIdx.x, threadIdx.x, overall_broken_chunk); + + // iterate over broken up chunks and write out in the typical manner + for (int c = 0; c <= overall_broken_chunk; ++c) + { + __syncthreads(); + int local_chunk = c - my_starting_offset[0]; + if (local_chunk >= 0 && local_chunk < num_broken_chunks[0]) + { + // it is our chunk - extract + int handled_bits = 0; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if ((chunk_breaks & (1 << i)) != 0) + { + if (handled_bits == local_chunk) + { + if (combinedEntries[i].isResult()) + smem.brokenChunkOffsetEnd = combinedEntries[i].memoffset(); + else + smem.brokenChunkOffsetEnd = combinedEntries[i].memoffset() + 1; + //printf("%d %d its my chunk time %d: %d\n", blockIdx.x, threadIdx.x, i, combinedEntries[i].memoffset()); + } + ++handled_bits; + } + } + } + __syncthreads(); + + if(threadIdx.x == 0) + { + if (c == overall_broken_chunk) + { + // need to setup last chunk + smem.brokenChunkOffsetEnd = smem.brokenChunkOffsetStart + tempData; + //printf("%d %d its last chunk time: %d\n", blockIdx.x, threadIdx.x, smem.brokenChunkOffsetStart + tempData); + } + /*if (threadIdx.x == 0 && smem.maxRow == 1878) + printf("We have allocData %u and other %u\n", allocData, smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart);*/ + uint32_t chunkoff = completeChunkAlloc(min(smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart, allocData), chunks, chunk_alloc, chunk_size, chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, chunk_pointer_pos, + [&]() + { + atomicOr(run_flag, 0x1); + //if(threadIdx.x == 0) + + // Write out descriptor for restart into global + //printf("%d going for restart: %x: %d -> %d -- block row range: %d<->%d\n", blockIdx.x, smem.runflag, (smem.runflag&(~0x80000000)), smem.A_row_ids.decodeRow(smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0])), block_start_end[0], block_start_end[1]); + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }, [&]() + { + atomicOr(run_flag, 0x2); + // Write out descriptor for restart into global + //if(threadIdx.x == 0) + //printf("%d going for restart: %x: %d -> %d -- block row range: %d<->%d\n", blockIdx.x, smem.runflag, (smem.runflag&(~0x80000000)), smem.A_row_ids.decodeRow(smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0])), block_start_end[0], block_start_end[1]); + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }); + smem.chunkStartOffset = chunkoff; + } + + __syncthreads(); + if (smem.chunkStartOffset == 0xFFFFFFFF) + return; + + smem.firstRowCount = 0; + + int num = min(smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart, allocData); + + allocData -= num; + + // write data for this chunk to smem and write out + for (uint32_t written = smem.brokenChunkOffsetStart; written < smem.brokenChunkOffsetEnd; written += TEMP_ITEMS_PER_BLOCK) + { + //store in shared for coalesced out + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && poffset >= written && poffset < written + TEMP_ITEMS_PER_BLOCK) + { + uint32_t pwrite = poffset - written; + INDEX_TYPE col = (combinedEntries[i].index & ((1u << colBits) - 1)) + 
smem.minCol; + typename ARowStorage::EncodedRowType row = (combinedEntries[i].index >> colBits) + smem.minRow; + smem.current_col_ids[pwrite] = col; + smem.current_row_ids[pwrite] = row; + smem.current_output[pwrite] = combinedEntries[i].value; + smem.rowCounts[pwrite] = combinedEntries[i].isRowend() ? combinedEntries[i].rowcount() : 0; + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < RETAIN_ELEMENTS_PER_THREAD; ++i) + { + //write out + INDEX_TYPE rid; + int writeout = written + i * THREADS + threadIdx.x - smem.brokenChunkOffsetStart; + if (writeout < num) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.chunkStartOffset)->values_direct(num); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.chunkStartOffset)->indices_direct(num); + valstart[writeout] = smem.current_output[i * THREADS + threadIdx.x]; + indexstart[writeout] = smem.current_col_ids[i * THREADS + threadIdx.x]; + rid = smem.current_row_ids[i * THREADS + threadIdx.x]; + // if (rid >= rows) { + // printf("%d %d rid bad row read %d \n",blockIdx.x, threadIdx.x , rid); + // } + if (smem.A_row_ids.decodeRow(rid) == 1878) + { + /*if(smem.current_col_ids[i * THREADS + threadIdx.x] == 0) + printf("ChunkStartOffset: %u with num: %u\n", smem.chunkStartOffset, num); + printf("Row %u: %u\n", smem.A_row_ids.decodeRow(rid), smem.current_col_ids[i * THREADS + threadIdx.x]);*/ + } + if (writeout == num - 1) + { + smem.maxBrokenChunkRow = rid; + smem.lastRowCount = smem.rowCounts[i * THREADS + threadIdx.x]; + } + } + else + rid = std::numeric_limits::max(); + + uint32_t rcount = smem.rowCounts[i * THREADS + threadIdx.x]; + if (rcount != 0 && rid != std::numeric_limits::max()) + { + //write row count + if (smem.firstRowCount == 0 && rid == smem.current_row_ids[0]) + { + smem.minBrokenChunkRow = rid; + smem.firstRowCount = rcount; + } + if ((smem.A_row_ids.decodeRow(rid) == 1878) /*|| (smem.A_row_ids.decodeRow(rid) == 11614) || (smem.A_row_ids.decodeRow(rid) == 14759) || (smem.A_row_ids.decodeRow(rid) == 14767) || (smem.A_row_ids.decodeRow(rid) == 11125)*/) + printf("Adding count: %u to row %u\n", rcount, (smem.A_row_ids.decodeRow(rid))); + atomicAdd(output_row_count + smem.A_row_ids.decodeRow(rid), rcount); + } + } + __syncthreads(); + } + + // last is shared if we are in a broken chunk (allocData > 0) or if we write out the last completely + bool shared_last = (allocData > 0 || tempData == num) && smem.minBrokenChunkRow != smem.maxBrokenChunkRow; + if (threadIdx.x < (shared_last ? 2 : 1)) + { + + //write header + /*if(smem.A_row_ids.decodeRow(smem.minBrokenChunkRow) <= 2605 && smem.A_row_ids.decodeRow(smem.maxBrokenChunkRow) >= 2605)*/ + /*printf("%d %d broken writing header: %d<->%d .%d %d. (%d/%d/%d)\n", blockIdx.x, threadIdx.x, + smem.A_row_ids.decodeRow(smem.minBrokenChunkRow), smem.A_row_ids.decodeRow(smem.maxBrokenChunkRow), smem.firstRowCount, smem.lastRowCount, allocData, num, tempData);*/ + + Chunk::place(chunks, smem.chunkStartOffset, num, smem.A_row_ids.decodeRow(smem.minBrokenChunkRow), smem.firstRowCount, smem.lastRowCount, (static_cast(blockIdx.x) << ChunkSortingBits) | (smem.chunk_counter + threadIdx.x)); + + bool minrow = threadIdx.x == 0 && smem.minBrokenChunkRow != smem.maxBrokenChunkRow; + uint32_t r = smem.A_row_ids.decodeRow(minrow ? 
smem.minBrokenChunkRow : smem.maxBrokenChunkRow); + Chunk* c = Chunk::cast(chunks, smem.chunkStartOffset); + + /*printf("%d %d adding shared row: %d first: %d - for encoded rows %d %d\n", blockIdx.x, threadIdx.x, r, minrow, smem.minBrokenChunkRow, smem.maxBrokenChunkRow);*/ + addPotentiallySharedRow(r, c, minrow, output_row_list_heads, shared_rows_tracker, shared_rows_alloc); + atomicAdd(output_row_chunk_count + r, 1); + + // set new local restart information + smem.runflag = tempData == num ? consumedwork : (0x80000000 | (smem.A_row_ids.restartRowEncode(smem.maxBrokenChunkRow, block_start_end[0]) + 1)); + + //printf("%d %d updating tempData %d -= %d -> %d and temp offset: %d\n", blockIdx.x, threadIdx.x, tempData, num, tempData - num, num % TEMP_ITEMS_PER_BLOCK); + + smem.brokenChunkOffsetStart = smem.brokenChunkOffsetEnd; + + //reset count + tempData = tempData - num; + tempOffset = num % TEMP_ITEMS_PER_BLOCK; + if (threadIdx.x == 0) + smem.chunk_counter += (shared_last ? 2 : 1); + } + } + } + else + { + //if (threadIdx.x == 0) + // printf("%d %d normal chunk for %d-%d\n", blockIdx.x, threadIdx.x, smem.minRow, smem.maxRow); + if (threadIdx.x == 0) + { + uint32_t chunkoff = completeChunkAlloc(allocData, chunks, chunk_alloc, chunk_size, chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, chunk_pointer_pos, + [&]() + { + atomicOr(run_flag, 0x1); + // Write out descriptor for restart into global + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }, + [&]() + { + atomicOr(run_flag, 0x2); + // Write out descriptor for restart into global + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }); + + smem.chunkStartOffset = chunkoff; + } + __syncthreads(); + if (smem.chunkStartOffset == 0xFFFFFFFF) + return; + + // every first element in row -> run prefix sum to determine number of entries in row + // not first or last, directly set count + // first and last row for potential overlap + // atomicMax at count + // if 0 before -> alloc list element and atomic exchange with head and write info + next pointer into list + // if the head was non-zero (second list element, add shared row entry: + // atomicAdd for alloc and write row + // : add first row in chunk to beginning of chunk + // add numentires to chunk + // add offset to data and column ids to chunk info + // this info can be updated for shared rows when we extract stuff :) + + smem.firstRowCount = 0; + //RowCounter rc(smem.rowcounterMem); + + for (uint32_t written = 0; written < tempData; written += TEMP_ITEMS_PER_BLOCK) + { + //store in shared for coalesced out + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && poffset >= written && poffset < written + TEMP_ITEMS_PER_BLOCK) + { + uint32_t pwrite = poffset - written; + INDEX_TYPE col = (combinedEntries[i].index & ((1u << colBits) - 1)) + smem.minCol; + typename ARowStorage::EncodedRowType row = (combinedEntries[i].index >> colBits) + smem.minRow; + //if (col > 21198119) + // printf("%d %d merge fucked up col: %d: %llx %d+d\n", blockIdx.x, threadIdx.x, col, combinedEntries[i].index, uint32_t(combinedEntries[i].index & ((1u << colBits) - 1)), smem.minCol); + smem.current_col_ids[pwrite] = col; + smem.current_row_ids[pwrite] = row; + smem.current_output[pwrite] = combinedEntries[i].value; + + //printf("%d %d entry %d: %d/%d %f\n", blockIdx.x, threadIdx.x, poffset, row, col, 
combinedEntries[i].value); + + /*if (col < smem.minCol || col > smem.maxCol || row < smem.minRow || row > smem.maxRow || row >= rows || col >= rows) + { + printf("%d %d bad entry: %llx %d = %d + %d (%d %d) %d (%d %d) - %d\n", blockIdx.x, threadIdx.x, combinedEntries[i].index, row, smem.minRow, (combinedEntries[i].index >> colBits), smem.minRow, smem.maxRow, col, smem.minCol, smem.maxCol, rows); + __trap(); + }*/ + + smem.rowCounts[pwrite] = combinedEntries[i].isRowend() ? combinedEntries[i].rowcount() : 0; + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < RETAIN_ELEMENTS_PER_THREAD; ++i) + { + //write out + INDEX_TYPE rid; + uint32_t writeout = written + i * THREADS + threadIdx.x; + if (writeout < allocData) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.chunkStartOffset)->values_direct(allocData); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.chunkStartOffset)->indices_direct(allocData); + valstart[writeout] = smem.current_output[i * THREADS + threadIdx.x]; + indexstart[writeout] = smem.current_col_ids[i * THREADS + threadIdx.x]; + rid = smem.current_row_ids[i * THREADS + threadIdx.x]; + //printf("row id %d", smem.current_row_ids[i * THREADS + threadIdx.x]); + //fixme? + // if ((rid >= rows || rid < 0) && rid != std::numeric_limits::max() ) + // printf("%d %d fffffffffffitting rid: %d %d allocdata: %d, %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData, std::numeric_limits::max() - rid ); + + } + else + { + rid = std::numeric_limits::max(); + //fixme: suspicious if theres an error, I thought I discarded these changes + // if ((rid >= rows || rid < 0) && rid != std::numeric_limits::max() ) + // printf("%d %d Eeeeeeeeeeeeeenonfitting rid: %d %d allocdata: %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData); + } + + uint32_t rcount = smem.rowCounts[i * THREADS + threadIdx.x]; + if (rcount != 0 && rid < rows) + { + //write row count + //if (written + threadIdx.x == tempData - 1) + // smem.lastRowCount = rcount; + if (smem.firstRowCount == 0 && rid == smem.current_row_ids[0]) + smem.firstRowCount = rcount; + + // if (rid >= rows || rid < 0) + // printf("%d %d nonfitting rid: %d %d allocdata: %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData); + auto b = smem.A_row_ids.decodeRow(rid); + atomicAdd(output_row_count + b, rcount); + } + } + __syncthreads(); + } + + bool shared_last = tempData == allocData && smem.minRow != smem.maxRow; + if (threadIdx.x < (shared_last ? 2 : 1)) + { + ////write header + //if (smem.A_row_ids.decodeRow(smem.minRow) >= 2605 && smem.A_row_ids.decodeRow(smem.maxRow) <= 2605) + //printf("%d %d writing header: %d<->%d .%d %d. (%d/%d)\n", blockIdx.x, threadIdx.x, + // smem.A_row_ids.decodeRow(smem.minRow), smem.A_row_ids.decodeRow(smem.maxRow), smem.firstRowCount, smem.lastRowCount, allocData, tempData); + Chunk::place(chunks, smem.chunkStartOffset, allocData, smem.A_row_ids.decodeRow(smem.minRow), smem.firstRowCount, smem.lastRowCount, (static_cast(blockIdx.x) << ChunkSortingBits) | (smem.chunk_counter + threadIdx.x)); + + + bool minrow = threadIdx.x == 0 && smem.minRow != smem.maxRow; + uint32_t r = smem.A_row_ids.decodeRow(minrow ? smem.minRow : smem.maxRow); + Chunk* c = Chunk::cast(chunks, smem.chunkStartOffset); + + //printf("%6d %4d adding shared row: %6d first: %d with %5d \n", blockIdx.x, threadIdx.x, r, minrow, minrow ? 
smem.firstRowCount : smem.lastRowCount); + addPotentiallySharedRow(r, c, minrow, output_row_list_heads, shared_rows_tracker, shared_rows_alloc); + atomicAdd(output_row_chunk_count + r, 1); + + // set new local restart information + smem.runflag = tempData == allocData ? consumedwork : (0x80000000 | (smem.A_row_ids.restartRowEncode(smem.maxRow, block_start_end[0]))); + + //printf("%d %d setting temp run flag to: %d == %d ? %d : (0x80000000 | %d) - %d -> %x %d\n", blockIdx.x, threadIdx.x, tempData, allocData, consumedwork, (smem.maxRow - block_start_end[0]), smem.maxRow, smem.runflag, smem.runflag & (~0x80000000)); + + //reset count + tempData = tempData - allocData; + tempOffset = allocData % TEMP_ITEMS_PER_BLOCK; + if (threadIdx.x == 0) + smem.chunk_counter += (shared_last ? 2 : 1); + } + } + } + else + { + // directly store to shared + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (combinedEntries[i].isResult()) + { + uint32_t poffset = combinedEntries[i].memoffset(); + smem.current_col_ids[poffset] = (combinedEntries[i].index & ((1u << colBits) - 1)) + smem.minCol; + smem.current_row_ids[poffset] = (combinedEntries[i].index >> colBits) + smem.minRow; + smem.current_output[poffset] = combinedEntries[i].value; + } + } + + //if (threadIdx.x == 0) + // printf("%d keep: %d->%d %d\n", blockIdx.x, smem.minRow, smem.maxRow, tempData); + tempOffset = 0; + } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + // All done + completion_status[blockIdx.x] = 0xFFFFFFFF; + } +} + + +template + void AcSpGEMMKernels::h_computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + /*fixme const T2 -> */const typename SEMIRING_t::rightInput_t* __restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + HANDLE_ERROR(cudaGetLastError()); + + computeSpgemmPart< NNZ_PER_THREAD, THREADS, BLOCKS_PER_MP, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_PATH_OPTIONS, typename SEMIRING_t::leftInput_t, typename SEMIRING_t::rightInput_t, typename SEMIRING_t::output_t, INDEX_TYPE, OFFSET_TYPE, SORT_TYPE_MODE, T, U, Label,SEMIRING_t> <<>> + (valA, indicesA, offsetsA, valB, indicesB, offsetsB, startingIdsA, nnz, rows, chunks, chunk_alloc, chunk_worst_case, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, output_row_count, output_row_list_heads, output_row_chunk_count, + shared_rows_tracker, shared_rows_alloc, expected_row_overlap, expected_row_overlap_inv, run_flag, completion_status, chunk_counter, chunk_pointer_pos, semiring); + HANDLE_ERROR(cudaGetLastError()); + +} + + +#define GPUCompressedMatrixMatrixMultiplyGEMM(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* 
__restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows,\ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count,\ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* __restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, \ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* __restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, \ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); diff --git a/include/GALATIC/include/device/consistent_gpu_memory.h b/include/GALATIC/include/device/consistent_gpu_memory.h new file mode 100644 index 00000000..78392d90 --- /dev/null +++ b/include/GALATIC/include/device/consistent_gpu_memory.h @@ -0,0 +1,93 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The 
above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include +#include "../../include/devicetools/memory.h" +#include "../memory_space.h" +#include "../consistent_memory.h" + +namespace ACSpGEMM { + template<> + class ConsistentMemory : RegisteredMemory + { + size_t _size; + CU::unique_ptr _ptr; + + size_t clear() override + { + auto s = _size; + reset(0); + return s; + } + public: + ConsistentMemory() : _size(0) + { + register_consistent_memory(this); + } + + ~ConsistentMemory() + { + unregister_consistent_memory(this); + } + + operator CUdeviceptr() const noexcept { return _ptr; } + + template + T* get() const noexcept { return reinterpret_cast(_ptr.operator long long unsigned int()); } + + void increaseMemRetainData(size_t size) + { + CU::unique_ptr tmp_ptr = CU::allocMemory(_size + size); + cudaMemcpy(tmp_ptr.get(), _ptr.get(), _size, cudaMemcpyDeviceToDevice); + _ptr.reset(); + _ptr = std::move(tmp_ptr); + _size += size; + } + + void assure(size_t size) + { + if (size > _size) + { + _ptr.reset(); + _ptr = CU::allocMemory(size); + _size = size; + } + } + void reset(size_t size = 0) + { + _ptr.reset(); + _size = 0; + assure(size); + } + }; +} diff --git a/include/GALATIC/include/devicetools/consistent_memory.h b/include/GALATIC/include/devicetools/consistent_memory.h new file mode 100644 index 00000000..6a2c30df --- /dev/null +++ b/include/GALATIC/include/devicetools/consistent_memory.h @@ -0,0 +1,112 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include +#include "CUDATools/memory_space.h" + +namespace HiSparse +{ + namespace Detail + { + class RegisteredMemory + { + public: + virtual size_t clear() = 0; + }; + + std::vector& getRegMemories() + { + static std::vector m; + return m; + } + + void register_consistent_memory(RegisteredMemory* memory) + { + getRegMemories().push_back(memory); + } + void unregister_consistent_memory(RegisteredMemory* memory) + { + auto &m = getRegMemories(); + std::remove(begin(m), end(m), memory); + } + size_t clear_consistentMemory() + { + size_t s = 0; + for (auto m : getRegMemories()) + s += m->clear(); + return s; + } + + template + class ConsistentMemory; + + template + class RegisteredMemoryVar : RegisteredMemory + { + T v; + size_t clear() override + { + v = 0; + return 0; + } + public: + RegisteredMemoryVar() : v(0) + { + register_consistent_memory(this); + } + explicit RegisteredMemoryVar(T v) : v(v) + { + register_consistent_memory(this); + } + ~RegisteredMemoryVar() + { + unregister_consistent_memory(this); + } + + RegisteredMemoryVar& operator+= (T add) + { + v += add; + return *this; + } + + void operator = (T other) + { + v = other; + } + operator T() const noexcept + { + return v; + } + }; + } +} diff --git a/include/GALATIC/include/devicetools/error.h b/include/GALATIC/include/devicetools/error.h new file mode 100644 index 00000000..803e2922 --- /dev/null +++ b/include/GALATIC/include/devicetools/error.h @@ -0,0 +1,297 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
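The two headers above implement the consistent-memory bookkeeping used throughout GALATIC: every allocation wrapper derives from RegisteredMemory, registers itself in a process-wide list, and clear_consistentMemory() walks that list to release everything in one call, reporting how many bytes were freed. A minimal standalone sketch of the same pattern (simplified names, std::malloc in place of device memory; not the actual GALATIC classes):

#include <algorithm>
#include <cstdlib>
#include <vector>

struct Registered { virtual size_t clear() = 0; virtual ~Registered() = default; };

static std::vector<Registered*>& registry() { static std::vector<Registered*> r; return r; }

struct ScratchBuffer : Registered {
    void*  ptr  = nullptr;
    size_t size = 0;
    ScratchBuffer()  { registry().push_back(this); }
    ~ScratchBuffer() {
        registry().erase(std::remove(registry().begin(), registry().end(), this), registry().end());
        std::free(ptr);
    }
    // Grow-only allocation, mirroring ConsistentMemory::assure() above.
    void assure(size_t s) {
        if (s > size) { std::free(ptr); ptr = std::malloc(s); size = s; }
    }
    // Analogue of RegisteredMemory::clear(): drop the allocation, report its size.
    size_t clear() override { std::free(ptr); ptr = nullptr; size_t s = size; size = 0; return s; }
};

// Analogue of clear_consistentMemory(): release every registered buffer at once.
inline size_t clear_all() {
    size_t total = 0;
    for (Registered* r : registry()) total += r->clear();
    return total;
}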
+// + + +#ifndef INCLUDED_CUDA_ERROR +#define INCLUDED_CUDA_ERROR + +#pragma once + +#include +#include + +#include + + +namespace CU +{ + template + struct error_traits; + + template + class basic_error : public error_traits::category + { + public: + virtual CUresult code() const noexcept override; + virtual const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + class error : public std::exception + { + public: + virtual CUresult code() const noexcept = 0; + virtual const char* name() const noexcept = 0; + + using invalid_value = basic_error; + using out_of_memory = basic_error; + using not_initialized = basic_error; + using deinitialized = basic_error; + using profiler_disabled = basic_error; + //using profiler_not_initialized = basic_error; + //using profiler_already_started = basic_error; + //using profiler_already_stopped = basic_error; + using no_device = basic_error; + using invalid_device = basic_error; + using invalid_image = basic_error; + using invalid_context = basic_error; + //using context_already_current = basic_error; + using map_failed = basic_error; + using unmap_failed = basic_error; + using array_is_mapped = basic_error; + using already_mapped = basic_error; + using no_binary_for_gpu = basic_error; + using already_acquired = basic_error; + using not_mapped = basic_error; + using not_mapped_as_array = basic_error; + using not_mapped_as_pointer = basic_error; + using ecc_uncorrectable = basic_error; + using unsupported_limit = basic_error; + using context_already_in_use = basic_error; + using peer_access_unsupported = basic_error; + using invalid_ptx = basic_error; + using invalid_graphics_context = basic_error; + using nvlink_uncorrectable = basic_error; + using jit_not_found = basic_error; + using invalid_source = basic_error; + using file_not_found = basic_error; + using shared_object_symbol_not_found = basic_error; + using shared_object_init_failed = basic_error; + using operating_system = basic_error; + using invalid_handle = basic_error; + using not_found = basic_error; + using not_ready = basic_error; + using illegal_address = basic_error; + using launch_out_of_resources = basic_error; + using launch_timeout = basic_error; + using launch_incompatible_texturing = basic_error; + using peer_access_already_enabled = basic_error; + using peer_access_not_enabled = basic_error; + using primary_context_active = basic_error; + using context_is_destroyed = basic_error; + using assertion_failed = basic_error; + using too_many_peers = basic_error; + using host_memory_already_registered = basic_error; + using host_memory_not_registered = basic_error; + using hardware_stack_error = basic_error; + using illegal_instruction = basic_error; + using misaligned_address = basic_error; + using invalid_address_space = basic_error; + using invalid_pc = basic_error; + using launch_failed = basic_error; + using cooperative_launch_too_large = basic_error; + using not_permitted = basic_error; + using not_supported = basic_error; + using unknown = basic_error; + }; + + class logic_error : public error {}; + class runtime_error : public error {}; + class fatal_error : public runtime_error {}; + class bad_alloc : public error {}; + + + template + struct error_traits + { + using category = logic_error; + }; + + template + struct error_traits> + { + using category = runtime_error; + }; + + template + struct error_traits> + { + using category = fatal_error; + }; + + template + struct error_traits> + { + using category = bad_alloc; + }; + + extern 
template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + + + class unknown_error_code : public error + { + CUresult error_code; + + public: + unknown_error_code(CUresult error_code); + + CUresult code() const noexcept override; + const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + class unexpected_result : public error + { + CUresult result; + + public: + unexpected_result(CUresult result); + + CUresult code() const noexcept override; + const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + CUresult throw_error(CUresult result); + + inline CUresult succeed(CUresult result) + { + if (result != CUDA_SUCCESS) + throw unknown_error_code(throw_error(result)); + return result; + } + + + template + inline CUresult expect(CUresult result) + { + if (result != expected) + throw unexpected_result(result); + return result; + } + + template + inline CUresult expect(CUresult result) + { + if (result != expected_1) + return expect(result); + return result; + } +} + +using CU::throw_error; +using CU::succeed; +using CU::expect; + +#endif // INCLUDED_CUDA_ERROR diff --git a/include/GALATIC/include/devicetools/event.h b/include/GALATIC/include/devicetools/event.h new file mode 100644 index 00000000..faceecdc --- /dev/null +++ b/include/GALATIC/include/devicetools/event.h @@ -0,0 +1,58 @@ +// Project 
AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_CUDA_EVENT +#define INCLUDED_CUDA_EVENT + +#pragma once + +#include + +#include "unique_handle.h" + + +namespace CU +{ + struct EventDestroyDeleter + { + void operator ()(CUevent event) const + { + cuEventDestroy(event); + } + }; + + using unique_event = unique_handle; + + unique_event createEvent(unsigned int flags = CU_EVENT_DEFAULT); +} + +#endif // INCLUDED_CUDA_EVENT diff --git a/include/GALATIC/include/devicetools/memory.h b/include/GALATIC/include/devicetools/memory.h new file mode 100644 index 00000000..c5a7c13b --- /dev/null +++ b/include/GALATIC/include/devicetools/memory.h @@ -0,0 +1,95 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
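error.h, added earlier in this hunk, maps CUDA driver-API status codes onto a small C++ exception hierarchy: succeed() returns the CUresult unchanged on CUDA_SUCCESS and throws a CU::error-derived exception otherwise, so call sites do not have to check return codes by hand. A hedged usage sketch; the include path, a linked definition of throw_error(), and the context setup shown are assumptions, not part of this diff:

#include <cuda.h>
#include <iostream>
#include "devicetools/error.h"   // include path is an assumption

void allocate_scratch(CUdeviceptr& ptr, size_t bytes)
{
    // Throws instead of returning an error code on failure.
    succeed(cuMemAlloc(&ptr, bytes));
}

int main()
{
    try {
        succeed(cuInit(0));
        CUdevice dev;   succeed(cuDeviceGet(&dev, 0));
        CUcontext ctx;  succeed(cuCtxCreate(&ctx, 0, dev));

        CUdeviceptr scratch;
        allocate_scratch(scratch, 1 << 20);
        succeed(cuMemFree(scratch));
        succeed(cuCtxDestroy(ctx));
    } catch (const CU::error& e) {
        std::cerr << "CUDA call failed: " << e.what() << "\n";
        return 1;
    }
    return 0;
}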
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_CUDA_MEMORY +#define INCLUDED_CUDA_MEMORY + +#pragma once + +#include + +#include + +#include "../../include/devicetools/unique_handle.h" + + +namespace CU +{ + struct MemFreeDeleter + { + void operator ()(CUdeviceptr ptr) const + { + cudaFree(reinterpret_cast(ptr)); + } + }; + + using unique_ptr = unique_handle; + + + struct pitched_memory + { + pitched_memory(const pitched_memory&) = delete; + pitched_memory& operator =(const pitched_memory&) = delete; + + unique_ptr memory; + std::size_t pitch; + + pitched_memory() {} + + pitched_memory(unique_ptr memory, std::size_t pitch) + : memory(std::move(memory)), + pitch(pitch) + { + } + + pitched_memory(pitched_memory&& m) + : memory(std::move(m.memory)), + pitch(m.pitch) + { + } + + pitched_memory& operator =(pitched_memory&& m) + { + using std::swap; + swap(memory, m.memory); + pitch = m.pitch; + return *this; + } + }; + + + unique_ptr allocMemory(std::size_t size); + unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size); + pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size); +} + +#endif // INCLUDED_CUDA_MEMORY diff --git a/include/GALATIC/include/devicetools/memory_space.h b/include/GALATIC/include/devicetools/memory_space.h new file mode 100644 index 00000000..3e5aeb4d --- /dev/null +++ b/include/GALATIC/include/devicetools/memory_space.h @@ -0,0 +1,41 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
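memory.h wraps raw CUdeviceptr allocations in a move-only handle whose deleter calls cudaFree, so device buffers are released automatically when they leave scope. The same idea expressed against the CUDA runtime API with std::unique_ptr, as a standalone analogue (this is not the CU::unique_ptr / allocMemory from the header):

#include <cuda_runtime_api.h>
#include <memory>
#include <stdexcept>

struct CudaFreeDeleter {
    void operator()(void* p) const { cudaFree(p); }
};

template <typename T>
using device_buffer = std::unique_ptr<T, CudaFreeDeleter>;

// Allocates count elements of T on the device; throws on failure.
template <typename T>
device_buffer<T> alloc_device(std::size_t count)
{
    void* raw = nullptr;
    if (cudaMalloc(&raw, count * sizeof(T)) != cudaSuccess)
        throw std::runtime_error("cudaMalloc failed");
    return device_buffer<T>(static_cast<T*>(raw));
}

// Usage: the buffer is freed automatically at the end of the enclosing scope.
//   auto values = alloc_device<float>(1 << 20);
//   cudaMemset(values.get(), 0, (1 << 20) * sizeof(float));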
+// + +#pragma once + +namespace HiSparse +{ + enum class MemorySpace + { + host, + device + }; +} \ No newline at end of file diff --git a/include/GALATIC/include/devicetools/stream.h b/include/GALATIC/include/devicetools/stream.h new file mode 100644 index 00000000..f570457c --- /dev/null +++ b/include/GALATIC/include/devicetools/stream.h @@ -0,0 +1,59 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + + +#ifndef INCLUDED_CUDA_STREAM +#define INCLUDED_CUDA_STREAM + +#pragma once + +#include + +#include "../../include/devicetools/unique_handle.h" + + + +namespace CU +{ + struct StreamDestroyDeleter + { + void operator ()(CUstream stream) const + { + cuStreamDestroy(stream); + } + }; + + using unique_stream = unique_handle; + + unique_stream createStream(unsigned int flags = CU_STREAM_DEFAULT); +} + +#endif // INCLUDED_CUDA_STREAM diff --git a/include/GALATIC/include/devicetools/unique_handle.h b/include/GALATIC/include/devicetools/unique_handle.h new file mode 100644 index 00000000..32ef72a2 --- /dev/null +++ b/include/GALATIC/include/devicetools/unique_handle.h @@ -0,0 +1,132 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + + +#ifndef INCLUDED_CUDA_UNIQUE_HANDLE +#define INCLUDED_CUDA_UNIQUE_HANDLE + +#pragma once + +#include + + +namespace CU +{ + template + class unique_handle : Deleter + { + T h; + + void free(T handle) noexcept + { + if (handle != NULL_VALUE) + Deleter::operator ()(handle); + } + + public: + unique_handle(const unique_handle&) = delete; + unique_handle& operator =(const unique_handle&) = delete; + + using handle_type = T; + using deleter_type = Deleter; + + static constexpr T null_value = NULL_VALUE; + + explicit unique_handle(T handle = NULL_VALUE) noexcept + : h(handle) + { + } + + void consume(T handle) noexcept { h = handle; } + + + unique_handle(T handle, const Deleter& d) noexcept + : Deleter(d), + h(handle) + { + } + + unique_handle(T handle, Deleter&& d) noexcept + : Deleter(std::move(d)), + h(handle) + { + } + + unique_handle(unique_handle&& h) noexcept + : Deleter(std::move(static_cast(h))), + h(h.h) + { + h.h = NULL_VALUE; + } + + ~unique_handle() + { + free(h); + } + + operator T() const noexcept { return h; } + + template + DataType* get() const noexcept { return reinterpret_cast(h); } + + template + DataType* getRelease() noexcept { DataType* tmp = reinterpret_cast(h); h = 0ULL; return tmp; } + + unique_handle& operator =(unique_handle&& h) noexcept + { + using std::swap; + swap(*this, h); + return *this; + } + + T release() noexcept + { + T temp = h; + h = NULL_VALUE; + return temp; + } + + void reset(T handle = null_value) noexcept + { + using std::swap; + swap(this->h, handle); + free(handle); + } + + friend void swap(unique_handle& a, unique_handle& b) noexcept + { + using std::swap; + swap(a.h, b.h); + } + }; +} + +#endif // INCLUDED_CUDA_UNIQUE_HANDLE diff --git a/include/GALATIC/include/execution_stats.h b/include/GALATIC/include/execution_stats.h new file mode 100644 index 00000000..4fb6df11 --- /dev/null +++ b/include/GALATIC/include/execution_stats.h @@ -0,0 +1,159 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
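unique_handle, completed above, is the move-only RAII core behind unique_event, unique_stream and unique_ptr: it stores a raw handle together with a stateless deleter, destroys the handle when it goes out of scope, and offers release()/reset()/swap plus an implicit conversion back to the raw type. A hedged usage sketch of the createEvent()/createStream() helpers declared earlier in this diff; the include paths and an already-initialized CUDA context are assumptions:

#include <cuda.h>
#include "devicetools/event.h"    // include paths are assumptions
#include "devicetools/stream.h"

float timed_section()
{
    // Move-only handles: cuStreamDestroy / cuEventDestroy run automatically
    // when these objects leave scope.
    CU::unique_stream stream = CU::createStream();
    CU::unique_event  start  = CU::createEvent();
    CU::unique_event  stop   = CU::createEvent();

    cuEventRecord(start, stream);     // implicit conversion to CUevent / CUstream
    // ... enqueue work on `stream` here ...
    cuEventRecord(stop, stream);
    cuEventSynchronize(stop);

    float ms = 0.0f;
    cuEventElapsedTime(&ms, start, stop);
    return ms;
}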
+// + +#pragma once + +#include +#include + +struct ExecutionStats +{ + //timings + bool measure_all; + float duration; + float duration_blockstarts; + float duration_spgemm; + float duration_merge_case_computation; + float duration_merge_simple; + float duration_merge_max; + float duration_merge_generalized; + float duration_write_csr; + + + //merge cases + uint32_t shared_rows; + uint32_t simple_mergers; + uint32_t simple_rows; + uint32_t complex_rows; + uint32_t generalized_rows; + + //memory consumption + size_t mem_allocated_chunks; + size_t mem_used_chunks; + size_t mem_clear_return; + + //misc + size_t restarts; + int called{ 0 }; + friend std::ostream& operator<<(std::ostream&, const ExecutionStats&); + + ExecutionStats() : measure_all(false), + duration(0), duration_blockstarts(0), duration_spgemm(0), duration_merge_case_computation(0), + duration_merge_simple(0), duration_merge_max(0), duration_merge_generalized(0), duration_write_csr(0), + shared_rows(0), simple_mergers(0), simple_rows(0), complex_rows(0), generalized_rows(0), + mem_allocated_chunks(0), mem_used_chunks(), mem_clear_return(0), + restarts(0) { } + + ExecutionStats& operator+=(const ExecutionStats& stats) + { + this->duration += stats.duration; + this->duration_blockstarts += stats.duration_blockstarts; + this->duration_spgemm += stats.duration_spgemm; + this->duration_merge_case_computation += stats.duration_merge_case_computation; + this->duration_merge_simple += stats.duration_merge_simple; + this->duration_merge_max += stats.duration_merge_max; + this->duration_merge_generalized += stats.duration_merge_generalized; + this->duration_write_csr += stats.duration_write_csr; + this->shared_rows += stats.shared_rows; + this->simple_mergers += stats.simple_mergers; + this->simple_rows += stats.simple_rows; + this->complex_rows += stats.complex_rows; + this->generalized_rows += stats.generalized_rows; + this->mem_allocated_chunks += stats.mem_allocated_chunks; + this->mem_used_chunks += stats.mem_used_chunks; + this->mem_clear_return += stats.mem_clear_return; + this->restarts += stats.restarts; + ++called; + // printf("Overall: %f and added up: %f\n", stats.duration, (stats.duration_blockstarts + stats.duration_spgemm + stats.duration_merge_case_computation + + // stats.duration_merge_simple + stats.duration_merge_max + stats.duration_merge_generalized + stats.duration_write_csr)); + return *this; + } + + void reset() + { + this->duration = 0.0f; + this->duration_blockstarts = 0.0f; + this->duration_spgemm = 0.0f; + this->duration_merge_case_computation = 0.0f; + this->duration_merge_simple = 0.0f; + this->duration_merge_max = 0.0f; + this->duration_merge_generalized = 0.0f; + this->duration_write_csr = 0.0f; + this->shared_rows = 0; + this->simple_mergers = 0; + this->simple_rows = 0; + this->complex_rows = 0; + this->generalized_rows = 0; + this->mem_allocated_chunks = 0; + this->mem_used_chunks = 0; + this->mem_clear_return = 0; + this->restarts = 0; + } + + void normalize() + { + if (called) + { + float division_factor = static_cast(called); + this->duration /= division_factor; + this->duration_blockstarts /= division_factor; + this->duration_spgemm /= division_factor; + this->duration_merge_case_computation /= division_factor; + this->duration_merge_simple /= division_factor; + this->duration_merge_max /= division_factor; + this->duration_merge_generalized /= division_factor; + this->duration_write_csr /= division_factor; + this->shared_rows /= called; + this->simple_mergers /= called; + this->simple_rows /= 
called; + this->complex_rows /= called; + this->generalized_rows /= called; + this->mem_allocated_chunks /= called; + this->mem_used_chunks /= called; + this->mem_clear_return /= called; + this->restarts /= called; + } + } +}; + +inline std::ostream& operator<<(std::ostream& os, const ExecutionStats& obj) { + os << "Overall Duration: " << obj.duration << " ms\n"; + os << "Restarts: " << obj.restarts << std::endl; + if (obj.measure_all) + { + os << "Sum individual timings: " << obj.duration_blockstarts + obj.duration_spgemm + obj.duration_merge_case_computation + obj.duration_merge_simple + obj.duration_merge_max + obj.duration_merge_generalized + obj.duration_write_csr << " ms\n"; + os << std::string("Duration BlockStarts: ") << obj.duration_blockstarts << " ms | Duration SpGEMM: " << obj.duration_spgemm << " ms\n"; + os << "Duration MergeCase: " << obj.duration_merge_case_computation << " ms | Duration Merge Simple: " << obj.duration_merge_simple << " ms\n"; + os << "Duration Merge Max: " << obj.duration_merge_max << " ms | Duration Merge Generalized: " << obj.duration_merge_generalized << " ms\n"; + os << "Duration Merge Write CSR: " << obj.duration_write_csr << " ms\n"; + } + return os; +} diff --git a/include/GALATIC/include/memory_space.h b/include/GALATIC/include/memory_space.h new file mode 100644 index 00000000..86e4816b --- /dev/null +++ b/include/GALATIC/include/memory_space.h @@ -0,0 +1,39 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
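ExecutionStats, defined just above, is accumulate-and-average bookkeeping: reset() zeroes a per-iteration object, operator+= folds it into a running total and increments the internal call counter, normalize() divides every field by that counter, and operator<< prints the per-phase breakdown when measure_all is set. Roughly how the benchmark driver later in this diff uses it, with the actual multiply call replaced by a placeholder:

#include <iostream>
#include "execution_stats.h"   // include path is an assumption

void benchmark(int iterations)
{
    ExecutionStats per_run, total;
    per_run.measure_all = true;
    total.measure_all   = true;   // enables the detailed printout below

    for (int i = 0; i < iterations; ++i)
    {
        per_run.reset();
        // run_spgemm(per_run);    // placeholder: fills the duration_* fields
        total += per_run;          // accumulates and bumps `called`
    }

    total.normalize();             // average over the number of iterations
    std::cout << total;            // operator<< prints the per-phase timings
}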
+// + +#pragma once + + +enum class MemorySpace +{ + host, + device +}; diff --git a/include/GALATIC/include/meta_utils.h b/include/GALATIC/include/meta_utils.h new file mode 100644 index 00000000..623759d3 --- /dev/null +++ b/include/GALATIC/include/meta_utils.h @@ -0,0 +1,274 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_HIS_META_UTILS +#define INCLUDED_HIS_META_UTILS + +#pragma once + +#include +#include +#include "multi_arch_build.h" + + + using std::enable_if; + using std::declval; + using std::is_empty; + using std::conditional; + + template + struct type_match + { + static const bool value = false; + }; + + template + struct type_match + { + static const bool value = true; + }; + + template + struct static_divup + { + static const int value = (X + Y - 1) / Y; + }; + + template + struct static_popcnt + { + static const int value = ((X & 0x1) + static_popcnt< (X >> 1) >::value); + }; + template<> + struct static_popcnt<0> + { + static const int value = 0; + }; + + template + struct static_clz + { + static const int value = (X & 0x80000000) ? Completed : static_clz< (X << 1), Completed + 1 >::value; + }; + template + struct static_clz + { + static const int value = 32; + }; + + template + struct static_max; + + template + struct static_max + { + static const int value = VALUE; + }; + + template + struct static_max + { + static const int next_value = static_max::value; + static const int value = VALUE > next_value ? VALUE : next_value; + }; + + template + struct static_min; + + template + struct static_min + { + static const int value = VALUE; + }; + + template + struct static_min + { + static const int next_value = static_min::value; + static const int value = VALUE < next_value ? 
VALUE : next_value; + }; + + template + struct choose; + + template + struct choose + { + typedef typename choose::type type; + }; + template + struct choose<0, NC, NCS...> + { + typedef NC type; + }; + + + template + struct conditional_eval; + + template<> + struct conditional_eval + { + template + DUAL_BUILD_FUNCTION static void eval(F f) + { + f(); + } + }; + template<> + struct conditional_eval + { + template + DUAL_BUILD_FUNCTION static void eval(F f) + { + } + }; + + template class CONSUMER, int V, int END, int STEP, bool DONE, int... VALUES> + struct static_for_impl + { + using type = typename static_for_impl < CONSUMER, V+STEP, END, STEP, (V + STEP < END), VALUES..., V>::type; + }; + template class CONSUMER, int V, int END, int STEP, int... VALUES> + struct static_for_impl + { + using type = CONSUMER ; + }; + + template class CONSUMER, int END, int BEGIN = 0, int STEP = 1> + struct static_for + { + using type = typename static_for_impl < CONSUMER, BEGIN, END, STEP, (BEGIN < END)>::type; + }; + + + template + struct type_list { }; + + template class APPLIER, class COMBLIST, class... TYPELISTS> + struct apply_list_impl; + template class APPLIER, class... DONETYPES, class... NEWTYPES, class... REMTYPELISTS> + struct apply_list_impl, type_list, REMTYPELISTS...> + { + using type = typename apply_list_impl, REMTYPELISTS...>::type; + }; + template class APPLIER, class... DONETYPES> + struct apply_list_impl> + { + using type = APPLIER; + }; + template class APPLIER, class... TYPELISTS> + struct apply_list + { + using type = typename apply_list_impl, TYPELISTS... >::type; + }; + + template + struct inverse_list_impl; + template + struct inverse_list_impl, type_list> + { + using type = typename inverse_list_impl, type_list>::type; + }; + template + struct inverse_list_impl> + { + using type = INVERSE_LIST; + }; + template + struct inverse_list + { + using type = typename inverse_list_impl, TYPELIST>::type; + }; + + + template + struct sequence { }; + + template class APPLIER, class SEQUENCE> + struct apply_sequence; + template class APPLIER, int... 
NUMS> + struct apply_sequence> + { + using type = APPLIER; + }; + + template + struct select_from_impl; + template + struct select_from_impl, sequence> + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; + }; + template + struct select_from_impl, sequence> + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; + }; + template + struct select_from_impl, sequence<>> + { + using type = sequence; + }; + template + struct select_from + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<>, SEQUENCE > ::type; + }; + + + template class LOGICAL, class SEQUENCE> + struct sequence_any; + template class LOGICAL, int NUM, int...NUMS> + struct sequence_any > + { + static const bool value = LOGICAL::value || sequence_any>::value; + }; + template class LOGICAL> + struct sequence_any > + { + static const bool value = false; + }; + + template + struct static_is_zero + { + static const bool value = false; + }; + template<> + struct static_is_zero<0> + { + static const bool value = true; + }; + + +#endif //INCLUDED_HIS_META_UTILS diff --git a/include/GALATIC/include/multi_arch_build.h b/include/GALATIC/include/multi_arch_build.h new file mode 100644 index 00000000..ba5f8747 --- /dev/null +++ b/include/GALATIC/include/multi_arch_build.h @@ -0,0 +1,45 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
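meta_utils.h collects small compile-time helpers: static_divup is ceiling division, static_popcnt counts set bits, static_min/static_max fold an integer parameter pack, and static_for instantiates a consumer over an integer range. A few compile-time sanity checks, assuming plain int template parameters for these helpers (that parameter list is an assumption) and an assumed include path:

#include "meta_utils.h"   // include path is an assumption

static_assert(static_divup<10, 4>::value == 3,   "ceil(10 / 4) == 3");
static_assert(static_popcnt<0b1011>::value == 3, "three bits set in 0b1011");
static_assert(static_min<7, 3, 5>::value == 3,   "minimum of the pack");
static_assert(static_max<7, 3, 5>::value == 7,   "maximum of the pack");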
+// + +#pragma once + +#ifdef __CUDACC__ +#define DUAL_BUILD_FUNCTION __host__ __device__ +#else +#define DUAL_BUILD_FUNCTION +#endif + +#ifndef __CUDA_ARCH__ +inline float __uint_as_float(unsigned t) +{ + return *reinterpret_cast(&t); +} +#endif diff --git a/include/GALATIC/include/performTestCase.cu b/include/GALATIC/include/performTestCase.cu new file mode 100644 index 00000000..b614c302 --- /dev/null +++ b/include/GALATIC/include/performTestCase.cu @@ -0,0 +1,1019 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
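multi_arch_build.h keeps shared helpers compilable by both nvcc and a plain host compiler: DUAL_BUILD_FUNCTION expands to __host__ __device__ only under __CUDACC__, and a host-side fallback for __uint_as_float is supplied otherwise. Typical use, as a sketch (the function name and include path are illustrative, not from this diff):

#include "multi_arch_build.h"   // include path is an assumption

// Callable from host code and, when compiled with nvcc, from device code too;
// an ordinary inline function under any other compiler.
DUAL_BUILD_FUNCTION inline unsigned int divup(unsigned int a, unsigned int b)
{
    return (a + b - 1) / b;
}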
+// + +/*!/------------------------------------------------------------------------------ +* performTestCase.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +//surpress crash notification windows (close or debug program window) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#else +#include +#endif + +// Local includes +#include "dCSR.cuh" + +#include "CSR.cuh" +#include "COO.cuh" +#include "Vector.h" +#include "dVector.h" +#include "device/Multiply.cuh" +#include "Transpose.h" +#include "Compare.cuh" +#include "consistent_memory.h" +#include "CustomExceptions.h" +#include "SemiRingInterface.h" + +#ifdef _WIN32 +#include +#include +#include +using namespace std::filesystem; +#else +#include +using namespace std::experimental::filesystem; +#endif + +// CuSparse include +#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #include "nsparse/include/nsparseMultiply.h" + +// // RMerge include +// #include "RMerge/include/rmergeMultiply.h" + +// // BhSparse include +// #include"bhSparse/include/bhSparseMultiply.h" +struct canonical {}; + +struct testdouble : SemiRing, { + static double multiply(double& a, double& b) { return a * b; } + static double add(double & a, double & b) { return a + b; } + + static double MultiplicativeIdentity() { + return 1; + } + static double AdditiveIdentity() { + return 0; + } +}; + + +unsigned int padding = 0; +template +std::string typeext(); +template<> +std::string typeext() +{ + return std::string(""); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +template +std::string nameextension() +{ + return ""; +} +template<> +std::string nameextension() +{ + return "_d"; +} +template<> +std::string nameextension() +{ + return "_f"; +} + +template +bool isFloat() +{ + return false; +} + +template<> +bool isFloat() +{ + return true; +} + +// ################################################################# +// +uint32_t numTrailingBinaryZeros(uint32_t n) +{ + uint32_t mask = 1; + for (uint32_t i = 0; i < 32; i++, mask <<= 1) + if ((n & mask) != 0) + return i; + + return 32; +} + +// ################################################################# +// +void writeDetailedInfo(const ExecutionStats& stats, std::ofstream& out) +{ + out << stats.shared_rows << ";"; + out << stats.simple_rows << ";"; + out << stats.simple_mergers << ";"; + out << stats.complex_rows << ";"; + out << stats.generalized_rows << ";"; + out << stats.duration << ";"; + out << stats.duration_blockstarts << ";"; + out << stats.duration_spgemm << ";"; + out << stats.duration_merge_case_computation << ";"; + out << stats.duration_merge_simple << ";"; + out << stats.duration_merge_max << ";"; + out << stats.duration_merge_generalized << ";"; + out << stats.duration_write_csr << ";"; + out << stats.mem_clear_return << ";"; + out << stats.mem_allocated_chunks << ";"; + out << stats.mem_used_chunks << ";"; + out << stats.restarts << ";"; + out << std::endl; +} + +// ################################################################# +// +void getNextMatrix(const char* foldername, const std::string& lastname, std::string& nextname) +{ + bool found_last = false; + directory_iterator it{ foldername }; + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + continue; + if 
(it->path().extension() != ".mtx") + continue; + if (!found_last) + { + if (it->path().filename() != lastname) + continue; + else + { + found_last = true; + continue; + } + } + else + { + nextname = it->path().filename().string(); + return; + } + } + nextname = std::string(""); + return; +} + +// ################################################################# +// +std::string getColumnHeaders(uint32_t approaches, std::string prefix = "") +{ + std::string headers(prefix); + + if (approaches & (0x1 << 0)) + headers.append("cuSparse;"); + if (approaches & (0x1 << 1)) + headers.append("acSpGEMM;"); + // if (approaches & (0x1 << 2)) + // headers.append("nsparse;"); + // if (approaches & (0x1 << 3)) + // headers.append("RMerge;"); + // if (approaches & (0x1 << 4)) + // headers.append("bhSparse;"); + + headers.append("\n"); + + return headers; +} + +// ################################################################# +// +template +void writeMatrixStats(CSR& mat, const std::string matname, std::ofstream& outfs) +{ + typename CSR::Statistics stats = mat.rowStatistics(); + //"\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max; + outfs << matname << ";" << mat.rows << ";" << mat.cols << ";" << mat.nnz << ";" + << stats.mean << ";" << stats.std_dev << ";" << stats.min << ";" << stats.max << ";"; +} + +// ################################################################# +// +template +size_t countFloatingPointOperations(CSR& matA, CSR& matB) +{ + size_t count = 0; + for (auto nnzAiter = 0; nnzAiter < matA.nnz; ++nnzAiter) + count += matB.row_offsets[matA.col_ids[nnzAiter] + 1] - matB.row_offsets[matA.col_ids[nnzAiter]]; + return count; +} + +// ################################################################# +// +std::ostream& writeGPUInfo(std::ostream& file) +{ + int cudaDevice; + cudaGetDevice(&cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." << prop.minor << "\n"; + + file << "name;cc;num_multiprocessors;warp_size;max_threads_per_mp;regs_per_mp;shared_memory_per_mp;total_constant_memory;total_global_memory;clock_rate;max_threads_per_block;max_regs_per_block;max_shared_memory_per_block\n" + << prop.name << ';' + << prop.major << '.' 
+ << prop.minor << ';' + << prop.multiProcessorCount << ';' + << prop.warpSize<< ';' + << prop.maxThreadsPerMultiProcessor << ';' + << prop.regsPerMultiprocessor << ';' + << prop.sharedMemPerMultiprocessor << ';' + << prop.totalConstMem << ';' + << prop.totalGlobalMem << ';' + << prop.clockRate * 1000 << ';' + << prop.maxThreadsPerBlock << ';' + << prop.regsPerBlock << ';' + << prop.sharedMemPerBlock + << std::endl; + return file; +} + +// ################################################################# +// +template +int performSpGEMMTests(int argc, char ** argv) +{ + std::string name_extension = ""; + + bool runtests = true; + if (argc > 2) + runtests = std::string(argv[2]) != "0"; + + int cudaDevice = 0; + if (argc > 3) + cudaDevice = std::atoi(argv[3]); + + bool continue_run = true; + // if (argc > 4) + // continue_run = std::string(argv[4]) != "0"; + + std::vector trait_init = { 256, 3, 2, 4, 4, 16, 256, 8 }; + if (argc > 5) + { + + std::istringstream traitstream(argv[5]); + std::vector input_trait_init; + std::string val; + while (std::getline(traitstream, val, ',')) + input_trait_init.push_back(std::stoi(val)); + + if (input_trait_init.size() != trait_init.size()) + printf("Malformed trait init input param; %zu params required; fallback to default\n", trait_init.size()); + else + trait_init = input_trait_init; + } + + uint32_t approach_selector = 0xFFFFFFFF; + uint32_t first_approach = 0; + if (argc > 6) + { + approach_selector = std::stoi(argv[6]); + first_approach = numTrailingBinaryZeros(approach_selector); + if (approach_selector == 0) + { + printf("ERROR: No approaches selected for testing\n"); + return 0; + } + } + + cudaSetDevice(cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + std::string gpuname = prop.name; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(trait_init[0], trait_init[1], trait_init[2], trait_init[3], trait_init[4], trait_init[5], trait_init[6], trait_init[7]); + DefaultTraits.preferLoadBalancing = true; + + std::ofstream results; + std::ofstream mem_consumption; + std::ofstream ours_detailed; + std::ofstream stateout; + std::ofstream statsout; //This will go horribly wrong: stateout vs statsout + std::string trait_string = + std::to_string(trait_init[0]) + + "_" + std::to_string(trait_init[1]) + + "_" + std::to_string(trait_init[2]) + + "_" + std::to_string(trait_init[3]) + + "_" + std::to_string(trait_init[4]) + + "_" + std::to_string(trait_init[5]) + + "_" + std::to_string(trait_init[6]) + + "_" + std::to_string(trait_init[7]) + "_"; + std::string statefile = std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".state"; + std::string lastname; + std::string current_name; + unsigned num_approaches = 6; + unsigned current_approach = first_approach; + bool finished_write = true; + bool fresh_file = !continue_run; + if (continue_run) + { + std::ifstream last(statefile.c_str()); + if (last) + std::getline(last, lastname); + + if (last && !lastname.empty()) + { + current_name = lastname; + std::cout << "Continuing run after " << lastname << std::endl; + results.open((std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + mem_consumption.open((std::string("mem_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + ours_detailed.open((std::string("detailed_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + statsout.open("matrix_stats.csv", std::ios_base::app); + std::cout << "After open" << std::endl; + + + std::time_t now = std::time(NULL); + std::tm * ptm = std::localtime(&now); + char buffer[32]; + // Format: Mo, 15.06.2009 20:20:00 + std::strftime(buffer, 32, "%a, %d.%m.%Y %H:%M:%S", ptm); + std::cout << buffer << std::endl; + + std::string lastapproach; + std::getline(last, lastapproach); + current_approach = (std::stoi(lastapproach) + 1) % num_approaches; + std::string finished_write_string; + std::getline(last, finished_write_string); + finished_write = !finished_write_string.empty(); + + if (!finished_write) + { + results << -3 << ";"; + mem_consumption << -3 << ";"; + finished_write = true; + } + + last.close(); + + if (!(approach_selector & (0x1 << current_approach))) + { + //this limits us to 31 approaches :-p + uint32_t next_offset = numTrailingBinaryZeros((approach_selector & 0xEFFFFFFF) >> current_approach); + if (next_offset < sizeof(uint32_t) * 8) + { + current_approach += next_offset; + } + else + { + current_approach = first_approach; + + results << std::endl; + mem_consumption << std::endl; + + const char *foldername = argc == 1 ? "." : argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + } + } + else if (current_approach < std::stoi(lastapproach)) + { + const char *foldername = argc == 1 ? "." 
: argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + results << std::endl; + mem_consumption << std::endl; + + if (current_name.empty()) + { + return 0; + } + } + } + else + { + fresh_file = true; + } + last.close(); + stateout.open(statefile.c_str()); + } + + if (fresh_file) + { + + results.open((std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str()); + results << "\"sep=;\"\n"; + writeGPUInfo(results); + results << getColumnHeaders(approach_selector, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;Products;"); + + mem_consumption.open((std::string("mem_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + mem_consumption << "\"sep=;\"\n"; + writeGPUInfo(mem_consumption); + mem_consumption << getColumnHeaders(approach_selector & 14, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;"); + mem_consumption << std::endl; + + ours_detailed.open((std::string("detailed_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + ours_detailed << "\"sep=;\"\n"; + writeGPUInfo(ours_detailed); + ours_detailed << std::string("\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;num_shared_rows;simple_rows;simple_mergers;complex_mergers;generalized_mergers;") + + std::string("duration;duration_blockstarts;duration_spgemm;duration_merge_case_computation;duration_merge_simple;duration_merge_max;duration_merge_generalized;duration_write_csr;") + + std::string("clear_return;chunk_alloc;chunk_used;restarts;\n"); + ours_detailed << std::endl; + + statsout.open("matrix_stats.csv", std::ios_base::app); + statsout << "\"sep=;\"\n"; + statsout << "\nMatrix; rows; cols; nnz; r_mean; r_std_dev; r_min; r_max;" << std::endl; + } + + + CSR csrmat, csrmat2, result_mat; + + char *foldername; + if (argc == 1) + { + foldername = const_cast("."); + } + else + foldername = argv[1]; + + bool found = fresh_file; + directory_iterator it{ foldername }; + + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + { + continue; + } + if (it->path().extension() != ".mtx") + { + continue; + } + if (!found && continue_run) + { + if (current_name.compare(it->path().filename().string()) != 0) + { + // std::cout << "Filename not current name\n"; + // std::cout << it->path().filename() << it->path().filename().string().length() << std::endl; + // std::cout << current_name << current_name.length() << std::endl; + continue; + } + else + found = true; + } + + std::string testname = it->path().filename().stem().string(); + std::cout << "\n\nrunning " << testname << std::endl; + std::string mantname = it->path().string(); + std::string csr_name = mantname + typeext() + ".hicsr"; + + if (approach_selector & (0x1 << current_approach)) + { + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csrmat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << mantname << "\"\n"; + COO coo_mat = loadMTX(mantname.c_str()); + convert(csrmat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + std::cout << "Skipping matrix \"" << mantname.c_str() << "\"\n"; + continue; + } + try + { + std::cout 
<< "write csr file for future use\n"; + storeCSR(csrmat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + } + + if (current_approach == first_approach) + { + auto rowStats = csrmat.rowStatistics(); + + results << testname << ";"; + results << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + + mem_consumption << testname << ";"; + mem_consumption << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + + ours_detailed << testname << ";"; + ours_detailed << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + } + + if (continue_run) + stateout << it->path().filename().string() << std::endl << current_approach << std::endl; + + if (runtests) + { + std::cout << "Matrix: " << csrmat.rows << "x" << csrmat.cols << ": " << csrmat.nnz << " nonzeros\n"; + + int32_t warmup = 20; + int32_t iterations = 20; + + // if (csrmat.nnz>= 5000000) + // { + // warmup = 2; + // iterations = 10; + // } + + try + { + dCSR gpu_csrmat, gpu_csrmat2, d_csr_cuRes; + convert(gpu_csrmat, csrmat, 0); + cuSPARSE::CuSparseTest cusparse; + + //calculate the transpose if matrix is not square + if (gpu_csrmat.rows != gpu_csrmat.cols) + { + cusparse.Transpose(gpu_csrmat, gpu_csrmat2); + convert(csrmat2, gpu_csrmat2); + } + else + { + convert(gpu_csrmat2, csrmat, 0); + convert(csrmat2, csrmat, 0); + } + + //generate reference solution using cuSparse + unsigned cuSubdiv_nnz = 0; + if (current_approach != 0 || current_approach == first_approach) + { + cusparse.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + + if (current_approach == first_approach) + { + //write out stats of result matrix + CSR h_csr_cuRes; + convert(h_csr_cuRes, d_csr_cuRes); + writeMatrixStats(h_csr_cuRes, testname, statsout); + size_t fpo = countFloatingPointOperations(csrmat, csrmat2); + std::cout << "Multiplication Requires " << fpo << " Floating point operations" << std::endl; + statsout << fpo << std::endl; + results << fpo << ";"; + statsout.flush(); + statsout.close(); + } + } + + switch (current_approach) + { + case 0: + { + cuSPARSE::CuSparseTest cuSparseTest; + + unsigned cuSubdiv_nnz = 0; + double cuSparse_duration = 0; + for (int i = 0; i < warmup; i++) + { + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + } + + for (int i = 0; i < iterations; i++) + { + auto duration = cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + cuSparse_duration += duration; + } + cuSparse_duration /= static_cast(iterations); + std::cout << std::setw(20) << "cuSparse -> NNZ: " << cuSubdiv_nnz << std::endl; + std::cout << std::setw(20) << "cuSparse SpGEMM: " << cuSparse_duration << " ms" << std::endl; + + results << cuSparse_duration << ";"; + stateout << 1 << std::endl; + break; + } + case 1: + { + dCSR d_csr_hiRes; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + warmupstats.measure_all = false; + output_stats.measure_all = false; + double hisparse_duration = 0; + double duration_blockstarts = 0.0; + double duration_spgemm = 0.0; + double duration_merge_case_computation = 0.0; + double duration_merge_simple = 0.0; + double duration_merge_max = 0.0; + double duration_merge_generalized = 
0.0; + double duration_write_csr = 0.0; + + // Warmup iterations for multiplication + for (int i = 0; i < warmup; ++i) + { + warmupstats.reset(); + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, warmupstats, false); + } + + // Multiplication + for (int i = 0; i < iterations; ++i) + { + stats.reset(); + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, stats, false); + output_stats += stats; + } + + output_stats.normalize(); + hisparse_duration = output_stats.duration; + duration_blockstarts = output_stats.duration_blockstarts; + duration_spgemm = output_stats.duration_spgemm; + duration_merge_case_computation = output_stats.duration_merge_case_computation; + duration_merge_simple = output_stats.duration_merge_simple; + duration_merge_max = output_stats.duration_merge_max; + duration_merge_generalized = output_stats.duration_merge_generalized; + duration_write_csr = output_stats.duration_write_csr; + + + std::cout << std::setw(20) << "ac-SpGEMM -> NNZ: " << d_csr_hiRes.nnz << std::endl; + std::cout << std::setw(20) << "ac-SpGEMM SpGEMM: " << hisparse_duration << " ms" << std::endl; + + output_stats.mem_clear_return = ACSpGEMM::clear_consistentMemory(); + + if (ACSpGEMM::Compare(d_csr_cuRes, d_csr_hiRes, false)) + { + results << hisparse_duration << ";"; + mem_consumption << output_stats.mem_clear_return + output_stats.mem_allocated_chunks << ";"; + writeDetailedInfo(output_stats, ours_detailed); + } + else + { + results << -2 << ";"; + mem_consumption << -2 << ";"; + ours_detailed << std::endl; + } + + stateout << 1 << std::endl; + break; + } + case 2: + { + // dCSR d_nsparse_result_mat; + // double nsparse_timing{ 0.0 }; + // NSparse::MemStats nsparse_stats; + // // Warmup iterations for multiplication + // for (int i = 0; i < warmup; ++i) + // { + // d_nsparse_result_mat.reset(); + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat/*, nsparse_stats*/); + // } + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_nsparse_result_mat.reset(); + // nsparse_timing += NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat/*, nsparse_stats*/); + // } + // nsparse_timing /= iterations; + + // std::cout << std::setw(20) << "nsparse -> NNZ: " << d_nsparse_result_mat.nnz << std::endl; + // std::cout << std::setw(20) << "nsparse SpGEMM: " << nsparse_timing << " ms" << std::endl; + + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_nsparse_result_mat, false)) + // { + // results << nsparse_timing << ";"; + // // mem_consumption << nsparse_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // stateout << 1 << std::endl; + printf("NSparse not included in public repository\n"); + break; + } + case 3: + { + // dCSR d_rmerge_result_mat; + // double rmerge_timing{ 0.0 }; + // uint32_t rmerge_nnz{ 0 }; + // bool bitstable{true}; + // HiSparse::Test::RMergeExecutionStats rmerge_stats; + // HostVector rmerge_offsets(csrmat.row_offsets.get(), csrmat.rows + 1); + // rmerge_offsets[csrmat.rows] = csrmat.nnz; + // HostVector rmerge_indices(csrmat.col_ids.get(), csrmat.nnz); + // HostVector rmerge_values(csrmat.data.get(), csrmat.nnz); + // SparseHostMatrixCSR host_A(csrmat.cols, csrmat.rows, rmerge_values, rmerge_indices, rmerge_offsets); + + // HostVector rmerge_offsets2(csrmat2.row_offsets.get(), csrmat2.rows + 1); + // rmerge_offsets2[csrmat2.rows] = csrmat2.nnz; + // HostVector rmerge_indices2(csrmat2.col_ids.get(), csrmat2.nnz); + // HostVector 
rmerge_values2(csrmat2.data.get(), csrmat2.nnz); + // SparseHostMatrixCSR host_B(csrmat2.cols, csrmat2.rows, rmerge_values2, rmerge_indices2, rmerge_offsets2); + + // SparseDeviceMatrixCSR A = ToDevice(host_A); + // SparseDeviceMatrixCSR B = ToDevice(host_B); + // SparseDeviceMatrixCSR C; + + // for (uint32_t i = 0; i < warmup; ++i) + // { + // RMerge::Multiply(A, B, C); + // } + + // // Multiplication + // for (uint32_t i = 0; i < iterations; ++i) + // { + // rmerge_timing += RMerge::Multiply(A, B, C/*, rmerge_stats*/); + // rmerge_nnz = C.NonZeroCount(); + // } + // rmerge_timing /= iterations; + + // dCSR d_rmerge_result_mat; + // d_rmerge_result_mat.nnz = rmerge_nnz; + // d_rmerge_result_mat.rows = csrmat.rows; + // d_rmerge_result_mat.cols = csrmat2.cols; + // d_rmerge_result_mat.row_offsets = C.RowStarts().Data(); + // d_rmerge_result_mat.col_ids = C.ColIndices().Data(); + // d_rmerge_result_mat.data = C.Values().Data(); + + // std::cout << std::setw(20) << "RMerge -> NNZ: " << rmerge_nnz << std::endl; + // std::cout << std::setw(20) << "RMerge SpGEMM: " << rmerge_timing << " ms" << std::endl; + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_rmerge_result_mat, false)) + // { + // results << rmerge_timing << ";"; + // // mem_consumption << rmerge_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // // Let the other object destroy the memory + // d_rmerge_result_mat.row_offsets = nullptr; + // d_rmerge_result_mat.col_ids = nullptr; + // d_rmerge_result_mat.data = nullptr; + + // stateout << 1 << std::endl; + printf("RMerge not included in public repository\n"); + break; + } + case 4: + { + // dCSR d_bhSparse_result_mat; + // double bhSparse_timing{ 0.0 }; + // HiSparse::Test::bhSparseExecutionStats bhsparse_stats; + // // Warmup iterations for multiplication + // for (int i = 0; i < warmup; ++i) + // { + // d_bhSparse_result_mat.reset(); + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat); + // } + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_bhSparse_result_mat.reset(); + // bhSparse_timing += bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat/*, bhsparse_stats*/); + // } + // bhSparse_timing /= iterations; + + // std::cout << std::setw(20) << "bhSparse -> NNZ: " << d_bhSparse_result_mat.nnz << std::endl; + // std::cout << std::setw(20) << "bhSparse SpGEMM: " << bhSparse_timing << " ms" << std::endl; + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_bhSparse_result_mat, false)) + // { + // results << bhSparse_timing << ";"; + // // mem_consumption << bhsparse_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // stateout << 1 << std::endl; + // printf("After stateout\n"); + printf("bhSparse not included in public repository\n"); + break; + } + default: + std::cout << "error: wrong test state" << std::endl; + break; + } + } + catch (const SpGEMMException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-4;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeSimpleCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-5;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << 
std::endl; + } + catch (const MergeMaxChunksCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-6;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeGeneralizedCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-7;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeLoopingException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-8;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfMemoryException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-9;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfChunkPointerException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-10;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const std::exception& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-1;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + results.flush(); + mem_consumption.flush(); + ours_detailed.flush(); + stateout.flush(); + } + results.flush(); + results.close(); + mem_consumption.flush(); + mem_consumption.close(); + ours_detailed.flush(); + ours_detailed.close(); + stateout.flush(); + stateout.close(); + + if (continue_run) + return 1; + } + std::cout << "Test done\n"; + return 0; +} + +// ################################################################# +// +int main(int argc, char *argv[]) +{ +#ifdef _WIN32 + //surpress crash notification windows (close or debug program window) + SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); +#endif + + std::string value_type = argc > 7 ? argv[7] : "f"; + if (value_type.compare("f") == 0) + return performSpGEMMTests(argc, argv); + else + return performSpGEMMTests(argc, argv); +} \ No newline at end of file diff --git a/include/GALATIC/minimal_example.cu b/include/GALATIC/minimal_example.cu new file mode 100644 index 00000000..c59b4de7 --- /dev/null +++ b/include/GALATIC/minimal_example.cu @@ -0,0 +1,134 @@ +/******************************************* +#include "GALATIC/include/CSR.cuh" +#include "GALATIC/include/dCSR.cuh" +#include "GALATIC/include/SemiRingInterface.h" +#include "GALATIC/source/device/Multiply.cuh" + +Your "includes" probably needs to look something like the above, rather than what's below. 
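+
+A hedged aside, not part of the original example: the Arith_SR struct below
+shows the plain arithmetic semiring. The same SemiRing interface can carry other
+algebras; a min-plus (tropical) sketch, assuming SemiRing takes the left input,
+right input and output value types as template parameters exactly as Arith_SR
+instantiates them with double, could look like:
+
+    struct MinPlus_SR : SemiRing<double, double, double>
+    {
+        // "multiply" combines an edge weight with a path length -> addition
+        __host__ __device__ double multiply(const double& a, const double& b) const { return a + b; }
+        // "add" keeps the shorter of two candidate paths -> minimum
+        __host__ __device__ double add(const double& a, const double& b) const { return a < b ? a : b; }
+        // identity of "add" is +infinity; DBL_MAX needs <cfloat>
+        __host__ __device__ static double AdditiveIdentity() { return DBL_MAX; }
+    };
+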
+*******************************************/ + +//#include "include/CSR.cuh" +//#include "include/dCSR.cuh" +#include "include/SemiRingInterface.h" +#include "include/TestSpGEMM.cuh" +#include + +//#include "source/device/Multiply.cuh" + +struct foo { + double a; +}; + +struct foo2 { + short h; + double a; + double b; + double c; + + double d; + short k; +}; + +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) const { return a * b; } + __host__ __device__ double add(const double& a, const double& b) const { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + + +int main(int argc, const char* argv[]) +{ + CSR input_A_CPU; + CSR input_B_CPU; + + COO input_A_COO; + COO input_B_COO; + + CSR result_mat_CPU; + + + + + + printf("%s + %s", argv[1], argv[2]); + input_A_COO = loadMTX(argv[1]); + input_B_COO = loadMTX(argv[2]); + + convert(input_A_CPU, input_A_COO); + convert(input_B_CPU, input_B_COO); + + // [ [ 1, 2], + // [ 3 4 ] ] + cudaDeviceSynchronize(); + + + // Transfer input matrices onto GPU + + + // load data into semiring struct. For this one, we don't need to do anything + Arith_SR semiring; + + + // Setup execution options, we'll skip the details for now. + + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 16; + const int MaxChunksGeneralizedMerge = 256; // MAX: 865 + const int MergePathOptions = 8; + + + GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions ); + + const bool Debug_Mode = false; + // DefaultTraits.preferLoadBalancing = true; + ExecutionStats stats; + // stats.measure_all = false; + typedef std::chrono::high_resolution_clock Time; + typedef std::chrono::milliseconds ms; + typedef std::chrono::duration fsec; + auto t0 = Time::now(); + + for (int i =0; i < 10000; i++){ + // Actually perform the matrix multiplicaiton + //if (i % 10 == 0) printf("%i\n",i); + dCSR input_A_GPU; + dCSR input_B_GPU; + convert(input_A_GPU, input_A_CPU); + convert(input_B_GPU, input_B_CPU); + cudaDeviceSynchronize(); + dCSR result_mat_GPU; + ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + cudaDeviceSynchronize(); + //std::cout << result_mat_GPU.nnz << std::endl; + convert(result_mat_CPU, result_mat_GPU); + cudaDeviceSynchronize(); + } + auto t1 = Time::now(); + fsec fs = t1 - t0; + ms d = std::chrono::duration_cast(fs); + dCSR result_mat_GPU; + dCSR input_A_GPU; + dCSR input_B_GPU; + convert(input_A_GPU, input_A_CPU); + convert(input_B_GPU, input_B_CPU); + ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + cudaDeviceSynchronize(); + printf("Took %d for 1000 tries, for an average of %d\n", d, (d / 1000)); + TestSpGEMM(input_A_GPU, input_B_GPU, semiring, [=] (const Arith_SR::output_t &a, const Arith_SR::output_t &b) { return std::abs(a-b) < 0.01; }, DefaultTraits); + + convert(result_mat_CPU, result_mat_GPU); + + cudaDeviceSynchronize(); + + for (int i =0; i < 4; i++) { + std::cout << "nnz: " << i << " val " << result_mat_CPU.data[i] << std::endl; + } + +} \ No newline at end of file diff --git a/include/GALATIC/source/checkBitStability.cuh b/include/GALATIC/source/checkBitStability.cuh new file mode 
100644 index 00000000..8c9c1b70 --- /dev/null +++ b/include/GALATIC/source/checkBitStability.cuh @@ -0,0 +1,874 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* performTestCase.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +//surpress crash notification windows (close or debug program window) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#else +#include +#endif + +// Local includes +#include "CSR.h" +#include "COO.h" +#include "Vector.h" +#include "dCSR.h" +#include "dVector.h" +#include "Multiply.h" +#include "Transpose.h" +#include "Compare.cuh" +#include "consistent_memory.h" +#include "CustomExceptions.h" + +#ifdef _WIN32 +#include +using namespace std::filesystem; +#else +#include +using namespace std::experimental::filesystem; +#endif + +// CuSparse include +#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #include "nsparse/include/nsparseMultiply.h" + +// // RMerge include +// #include "RMerge/include/rmergeMultiply.h" + +// // BhSparse include +// #include"bhSparse/include/bhSparseMultiply.h" + +unsigned int padding = 0; +template +std::string typeext(); +template<> +std::string typeext() +{ + return std::string(""); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +template +std::string nameextension() +{ + return ""; +} +template<> +std::string nameextension() +{ + return "_d"; +} +template<> +std::string nameextension() +{ + return "_f"; +} + +template +bool isFloat() +{ + return false; +} + +template<> +bool isFloat() +{ + return true; +} + +// ################################################################# +// +uint32_t numTrailingBinaryZeros(uint32_t n) +{ + uint32_t mask = 
1; + for (uint32_t i = 0; i < 32; i++, mask <<= 1) + if ((n & mask) != 0) + return i; + + return 32; +} + +// ################################################################# +// +void writeDetailedInfo(const ExecutionStats& stats, std::ofstream& out) +{ + out << stats.shared_rows << ";"; + out << stats.simple_rows << ";"; + out << stats.simple_mergers << ";"; + out << stats.complex_rows << ";"; + out << stats.generalized_rows << ";"; + out << stats.duration << ";"; + out << stats.duration_blockstarts << ";"; + out << stats.duration_spgemm << ";"; + out << stats.duration_merge_case_computation << ";"; + out << stats.duration_merge_simple << ";"; + out << stats.duration_merge_max << ";"; + out << stats.duration_merge_generalized << ";"; + out << stats.duration_write_csr << ";"; + out << stats.mem_clear_return << ";"; + out << stats.mem_allocated_chunks << ";"; + out << stats.mem_used_chunks << ";"; + out << stats.restarts << ";"; + out << std::endl; +} + +// ################################################################# +// +void getNextMatrix(const char* foldername, const std::string& lastname, std::string& nextname) +{ + bool found_last = false; + directory_iterator it{ foldername }; + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + continue; + if (it->path().extension() != ".mtx") + continue; + if (!found_last) + { + if (it->path().filename() != lastname) + continue; + else + { + found_last = true; + continue; + } + } + else + { + nextname = it->path().filename().string(); + return; + } + } + nextname = std::string(""); + return; +} + +// ################################################################# +// +std::string getColumnHeaders(uint32_t approaches, std::string prefix = "") +{ + std::string headers(prefix); + + if (approaches & (0x1 << 0)) + headers.append("cuSparse;"); + if (approaches & (0x1 << 1)) + headers.append("acSpGEMM;"); + // if (approaches & (0x1 << 2)) + // headers.append("nsparse;"); + // if (approaches & (0x1 << 3)) + // headers.append("RMerge;"); + // if (approaches & (0x1 << 4)) + // headers.append("bhSparse;"); + + headers.append("\n"); + + return headers; +} + +// ################################################################# +// +template +void writeMatrixStats(CSR& mat, const std::string matname, std::ofstream& outfs) +{ + typename CSR::Statistics stats = mat.rowStatistics(); + //"\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max; + outfs << matname << ";" << mat.rows << ";" << mat.cols << ";" << mat.nnz << ";" + << stats.mean << ";" << stats.std_dev << ";" << stats.min << ";" << stats.max << ";"; +} + +// ################################################################# +// +template +size_t countFloatingPointOperations(CSR& matA, CSR& matB) +{ + size_t count = 0; + for (auto nnzAiter = 0; nnzAiter < matA.nnz; ++nnzAiter) + count += matB.row_offsets[matA.col_ids[nnzAiter] + 1] - matB.row_offsets[matA.col_ids[nnzAiter]]; + return count; +} + +// ################################################################# +// +std::ostream& writeGPUInfo(std::ostream& file) +{ + int cudaDevice; + cudaGetDevice(&cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + + file << "name;cc;num_multiprocessors;warp_size;max_threads_per_mp;regs_per_mp;shared_memory_per_mp;total_constant_memory;total_global_memory;clock_rate;max_threads_per_block;max_regs_per_block;max_shared_memory_per_block\n" + << prop.name << ';' + << prop.major << '.' + << prop.minor << ';' + << prop.multiProcessorCount << ';' + << prop.warpSize<< ';' + << prop.maxThreadsPerMultiProcessor << ';' + << prop.regsPerMultiprocessor << ';' + << prop.sharedMemPerMultiprocessor << ';' + << prop.totalConstMem << ';' + << prop.totalGlobalMem << ';' + << prop.clockRate * 1000 << ';' + << prop.maxThreadsPerBlock << ';' + << prop.regsPerBlock << ';' + << prop.sharedMemPerBlock + << std::endl; + return file; +} + +// ################################################################# +// +template +int performSpGEMMTests(int argc, char ** argv) +{ + std::string name_extension = ""; + + bool runtests = true; + if (argc > 2) + runtests = std::string(argv[2]) != "0"; + + int cudaDevice = 0; + if (argc > 3) + cudaDevice = std::atoi(argv[3]); + + bool continue_run = false; + if (argc > 4) + continue_run = std::string(argv[4]) != "0"; + + std::vector trait_init = { 256, 3, 2, 4, 4, 16, 256, 8 }; + if (argc > 5) + { + + std::istringstream traitstream(argv[5]); + std::vector input_trait_init; + std::string val; + while (std::getline(traitstream, val, ',')) + input_trait_init.push_back(std::stoi(val)); + + if (input_trait_init.size() != trait_init.size()) + printf("Malformed trait init input param; %zu params required; fallback to default\n", trait_init.size()); + else + trait_init = input_trait_init; + } + + uint32_t approach_selector = 0xFFFFFFFF; + uint32_t first_approach = 0; + if (argc > 6) + { + approach_selector = std::stoi(argv[6]); + first_approach = numTrailingBinaryZeros(approach_selector); + if (approach_selector == 0) + { + printf("ERROR: No approaches selected for testing\n"); + return 0; + } + } + + cudaSetDevice(cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + std::string gpuname = prop.name; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(trait_init[0], trait_init[1], trait_init[2], trait_init[3], trait_init[4], trait_init[5], trait_init[6], trait_init[7]); + DefaultTraits.preferLoadBalancing = true; + + std::ofstream results; + std::ofstream stateout; + std::ofstream statsout; //This will go horribly wrong: stateout vs statsout + std::string trait_string = + std::to_string(trait_init[0]) + + "_" + std::to_string(trait_init[1]) + + "_" + std::to_string(trait_init[2]) + + "_" + std::to_string(trait_init[3]) + + "_" + std::to_string(trait_init[4]) + + "_" + std::to_string(trait_init[5]) + + "_" + std::to_string(trait_init[6]) + + "_" + std::to_string(trait_init[7]) + "_"; + std::string statefile = std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".state"; + std::string lastname; + std::string current_name; + unsigned num_approaches = 6; + unsigned current_approach = first_approach; + bool finished_write = true; + bool fresh_file = !continue_run; + if (continue_run) + { + std::ifstream last(statefile.c_str()); + if (last) + { + std::getline(last, lastname); + current_name = lastname; + std::cout << "Continuing run after " << lastname << std::endl; + results.open((std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + statsout.open("matrix_stats.csv", std::ios_base::app); + std::cout << "After open" << std::endl; + + + std::time_t now = std::time(NULL); + std::tm * ptm = std::localtime(&now); + char buffer[32]; + // Format: Mo, 15.06.2009 20:20:00 + std::strftime(buffer, 32, "%a, %d.%m.%Y %H:%M:%S", ptm); + std::cout << buffer << std::endl; + + std::string lastapproach; + std::getline(last, lastapproach); + current_approach = (std::stoi(lastapproach) + 1) % num_approaches; + std::string finished_write_string; + std::getline(last, finished_write_string); + finished_write = !finished_write_string.empty(); + + if (!finished_write) + { + results << -3 << ";"; + finished_write = true; + } + + last.close(); + + if (!(approach_selector & (0x1 << current_approach))) + { + //this limits us to 31 approaches :-p + uint32_t next_offset = numTrailingBinaryZeros((approach_selector & 0xEFFFFFFF) >> current_approach); + if (next_offset < sizeof(uint32_t) * 8) + { + current_approach += next_offset; + } + else + { + current_approach = first_approach; + + results << std::endl; + + const char *foldername = argc == 1 ? "." : argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + } + } + else if (current_approach < std::stoi(lastapproach)) + { + const char *foldername = argc == 1 ? "." 
: argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + results << std::endl; + + if (current_name.empty()) + { + return 0; + } + } + } + else + { + fresh_file = true; + } + last.close(); + stateout.open(statefile.c_str()); + } + + if (fresh_file) + { + + results.open((std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str()); + results << "\"sep=;\"\n"; + writeGPUInfo(results); + results << getColumnHeaders(approach_selector, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;Products;"); + + statsout.open("matrix_stats.csv", std::ios_base::app); + statsout << "\"sep=;\"\n"; + statsout << "\nMatrix; rows; cols; nnz; r_mean; r_std_dev; r_min; r_max;" << std::endl; + } + + + CSR csrmat, csrmat2, result_mat; + + char *foldername; + if (argc == 1) + { + foldername = const_cast("."); + } + else + foldername = argv[1]; + + bool found = fresh_file; + directory_iterator it{ foldername }; + + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + { + continue; + } + if (it->path().extension() != ".mtx") + { + continue; + } + if (!found && continue_run) + { + if (current_name.compare(it->path().filename().string()) != 0) + { + // std::cout << "Filename not current name\n"; + // std::cout << it->path().filename() << it->path().filename().string().length() << std::endl; + // std::cout << current_name << current_name.length() << std::endl; + continue; + } + else + found = true; + } + + std::string testname = it->path().filename().stem().string(); + std::cout << "\n\nrunning " << testname << std::endl; + std::string mantname = it->path().string(); + std::string csr_name = mantname + typeext() + ".hicsr"; + + if (approach_selector & (0x1 << current_approach)) + { + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csrmat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << mantname << "\"\n"; + COO coo_mat = loadMTX(mantname.c_str()); + convert(csrmat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + std::cout << "Skipping matrix \"" << mantname.c_str() << "\"\n"; + continue; + } + try + { + std::cout << "write csr file for future use\n"; + storeCSR(csrmat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + } + + if (current_approach == first_approach) + { + auto rowStats = csrmat.rowStatistics(); + + results << testname << ";"; + results << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + } + + if (continue_run) + stateout << it->path().filename().string() << std::endl << current_approach << std::endl; + + if (runtests) + { + std::cout << "Matrix: " << csrmat.rows << "x" << csrmat.cols << ": " << csrmat.nnz << " nonzeros\n"; + + int32_t iterations = 20; + + try + { + dCSR gpu_csrmat, gpu_csrmat2, d_csr_cuRes; + convert(gpu_csrmat, csrmat, 0); + cuSPARSE::CuSparseTest cusparse; + + //calculate the transpose if matrix is not square + if (gpu_csrmat.rows != gpu_csrmat.cols) + { + cusparse.Transpose(gpu_csrmat, gpu_csrmat2); + convert(csrmat2, gpu_csrmat2); + } + else + { + 
convert(gpu_csrmat2, csrmat, 0); + convert(csrmat2, csrmat, 0); + } + + //generate reference solution using cuSparse + unsigned cuSubdiv_nnz = 0; + if (current_approach != 0 || current_approach == first_approach) + { + cusparse.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + + if (current_approach == first_approach) + { + //write out stats of result matrix + CSR h_csr_cuRes; + convert(h_csr_cuRes, d_csr_cuRes); + writeMatrixStats(h_csr_cuRes, testname, statsout); + size_t fpo = countFloatingPointOperations(csrmat, csrmat2); + std::cout << "Multiplication Requires " << fpo << " Floating point operations" << std::endl; + statsout << fpo << std::endl; + results << fpo << ";"; + statsout.flush(); + statsout.close(); + } + } + + switch (current_approach) + { + case 0: + { + dCSR d_csr_cuRes_comp; + cuSPARSE::CuSparseTest cuSparseTest; + bool bitstable{true}; + + for (int i = 0; i < iterations; i++) + { + if(i == 0) + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + else + { + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes_comp, cuSubdiv_nnz); + if (!(ACSpGEMM::Compare(d_csr_cuRes, d_csr_cuRes_comp, true))) + { + printf("cuSparse: ## NOT ## Bit-Identical\n"); + results << -999 << ";"; + bitstable = false; + break; + } + } + } + if(bitstable) + { + printf("cuSparse: Bit-Identical\n"); + results << 0 << ";"; + } + stateout << 1 << std::endl; + break; + } + case 1: + { + dCSR d_csr_hiRes, d_csr_hiRes_comp; + ExecutionStats stats; + stats.measure_all = false; + bool bitstable{true}; + + // Multiplication + for (int i = 0; i < iterations; ++i) + { + stats.reset(); + if(i == 0) + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, stats, false); + else + { + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes_comp, DefaultTraits, stats, false); + if (!(ACSpGEMM::Compare(d_csr_hiRes, d_csr_hiRes_comp, true))) + { + printf("AcSpGEMM: ## NOT ## Bit-Identical\n"); + results << -999 << ";"; + bitstable = false; + break; + } + } + } + + if(bitstable) + { + printf("AcSpGEMM: Bit-Identical\n"); + results << 0 << ";"; + } + stateout << 1 << std::endl; + break; + } + case 2: + { + // dCSR d_nsparse_result_mat, d_nsparse_result_mat_comp; + // bool bitstable{true}; + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_nsparse_result_mat_comp.reset(); + // if(i == 0) + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat); + // else + // { + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat_comp); + // if (!(ACSpGEMM::Compare(d_nsparse_result_mat, d_nsparse_result_mat_comp, true))) + // { + // printf("Nsparse: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + // } + + // if(bitstable) + // { + // printf("Nsparse: Bit-Identical\n"); + // results << 0 << ";"; + // } + + // stateout << 1 << std::endl; + printf("Nsparse not included in public repository\n"); + break; + } + case 3: + { + // bool bitstable{true}; + // uint32_t rmerge_nnz{ 0 }; + // HiSparse::Test::RMergeExecutionStats rmerge_stats; + // HostVector rmerge_offsets(csrmat.row_offsets.get(), csrmat.rows + 1); + // rmerge_offsets[csrmat.rows] = csrmat.nnz; + // HostVector rmerge_indices(csrmat.col_ids.get(), csrmat.nnz); + // HostVector rmerge_values(csrmat.data.get(), csrmat.nnz); + // SparseHostMatrixCSR host_A(csrmat.cols, csrmat.rows, rmerge_values, rmerge_indices, rmerge_offsets); + + // HostVector rmerge_offsets2(csrmat2.row_offsets.get(), csrmat2.rows + 1); + 
// rmerge_offsets2[csrmat2.rows] = csrmat2.nnz; + // HostVector rmerge_indices2(csrmat2.col_ids.get(), csrmat2.nnz); + // HostVector rmerge_values2(csrmat2.data.get(), csrmat2.nnz); + // SparseHostMatrixCSR host_B(csrmat2.cols, csrmat2.rows, rmerge_values2, rmerge_indices2, rmerge_offsets2); + + // SparseDeviceMatrixCSR A = ToDevice(host_A); + // SparseDeviceMatrixCSR B = ToDevice(host_B); + // SparseDeviceMatrixCSR C, C_comp; + + + // RMerge::Multiply(A, B, C); + // dCSR d_rmerge_result_mat, d_rmerge_result_mat_comp; + // d_rmerge_result_mat.nnz = rmerge_nnz; + // d_rmerge_result_mat.rows = csrmat.rows; + // d_rmerge_result_mat.cols = csrmat2.cols; + // d_rmerge_result_mat.row_offsets = C.RowStarts().Data(); + // d_rmerge_result_mat.col_ids = C.ColIndices().Data(); + // d_rmerge_result_mat.data = C.Values().Data(); + + // // Multiplication + // for (uint32_t i = 0; i < iterations; ++i) + // { + // RMerge::Multiply(A, B, C_comp); + // rmerge_nnz = C_comp.NonZeroCount(); + // d_rmerge_result_mat_comp.nnz = rmerge_nnz; + // d_rmerge_result_mat_comp.rows = csrmat.rows; + // d_rmerge_result_mat_comp.cols = csrmat2.cols; + // d_rmerge_result_mat_comp.row_offsets = C_comp.RowStarts().Data(); + // d_rmerge_result_mat_comp.col_ids = C_comp.ColIndices().Data(); + // d_rmerge_result_mat_comp.data = C_comp.Values().Data(); + // if (!(ACSpGEMM::Compare(d_rmerge_result_mat, d_rmerge_result_mat, true))) + // { + // printf("RMerge: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + + // // Let the other object destroy the memory + // d_rmerge_result_mat.row_offsets = nullptr; + // d_rmerge_result_mat.col_ids = nullptr; + // d_rmerge_result_mat.data = nullptr; + + // if(bitstable) + // { + // printf("RMerge: Bit-Identical\n"); + // results << 0 << ";"; + // } + // stateout << 1 << std::endl; + printf("RMerge not included in public repository\n"); + break; + } + case 4: + { + // dCSR d_bhSparse_result_mat, d_bhSparse_result_mat_comp; + // bool bitstable{true}; + // HiSparse::Test::bhSparseExecutionStats bhsparse_stats; + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_bhSparse_result_mat_comp.reset(); + // if(i == 0) + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat); + // else + // { + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat_comp); + // if (!(ACSpGEMM::Compare(d_bhSparse_result_mat, d_bhSparse_result_mat_comp, true))) + // { + // printf("BhSparse: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + // } + + // if(bitstable) + // { + // printf("BhSparse: Bit-Identical\n"); + // results << 0 << ";"; + // } + // stateout << 1 << std::endl; + printf("bhSparse not included in public repository\n"); + break; + } + default: + std::cout << "error: wrong test state" << std::endl; + break; + } + } + catch (const SpGEMMException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-4;"; + + stateout << 0 << std::endl; + } + catch (const MergeSimpleCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-5;"; + + stateout << 0 << std::endl; + } + catch (const MergeMaxChunksCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-6;"; + + stateout << 0 << std::endl; + } + catch (const MergeGeneralizedCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-7;"; + + stateout << 0 << std::endl; + } + catch (const 
MergeLoopingException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-8;"; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfMemoryException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-9;"; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfChunkPointerException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-10;"; + + stateout << 0 << std::endl; + } + catch (const std::exception& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-1;"; + + stateout << 0 << std::endl; + } + results.flush(); + stateout.flush(); + } + results.flush(); + results.close(); + stateout.flush(); + stateout.close(); + + if (continue_run) + return 1; + } + std::cout << "Test done\n"; + return 0; +} + +// ################################################################# +// +int main(int argc, char *argv[]) +{ +#ifdef _WIN32 + //surpress crash notification windows (close or debug program window) + SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); +#endif + + std::string value_type = argc > 7 ? argv[7] : "f"; + if (value_type.compare("f") == 0) + return performSpGEMMTests(argc, argv); + else + return performSpGEMMTests(argc, argv); +} \ No newline at end of file diff --git a/include/GALATIC/source/device/Compare.cuh b/include/GALATIC/source/device/Compare.cuh new file mode 100644 index 00000000..539e8b37 --- /dev/null +++ b/include/GALATIC/source/device/Compare.cuh @@ -0,0 +1,98 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
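+//
+// A hedged usage sketch (not the library's host-side wrapper from Compare.h) for
+// the row-wise verification kernel d_compare defined further below: one thread
+// per row, a single device flag collects any mismatch. The block size and the
+// DataType = double instantiation are assumptions made for illustration only.
+//
+//   uint32_t* d_flag;
+//   cudaMalloc(&d_flag, sizeof(uint32_t));
+//   cudaMemset(d_flag, 0, sizeof(uint32_t));
+//   const int block = 256;
+//   const int grid = (rows + block - 1) / block;
+//   d_compare<double><<<grid, block>>>(rows, cols,
+//       ref.row_offsets, ref.col_ids, ref.data,      // reference CSR (device pointers)
+//       cmp.row_offsets, cmp.col_ids, cmp.data,      // candidate CSR (device pointers)
+//       /*compare_data*/ true, /*epsilon*/ 0.0, d_flag);
+//   uint32_t mismatch = 0;
+//   cudaMemcpy(&mismatch, d_flag, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+//   // mismatch != 0 -> structure or values differ between reference and candidate.
+//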
+// + +/*!/------------------------------------------------------------------------------ +* Compare.cu +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include + +// Local includes +#include "Compare.h" +#include "common.h" + +//#define VERIFICATION_TEXT + +template +__global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values, + const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) + { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) + { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) + { + if (reference_values[ref_offset + i] != compare_values[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + + return; +} +}; + diff --git a/include/GALATIC/source/device/Multiply.cuh b/include/GALATIC/source/device/Multiply.cuh new file mode 100644 index 00000000..2d295151 --- /dev/null +++ b/include/GALATIC/source/device/Multiply.cuh @@ -0,0 +1,938 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Multiply.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include "memory.cuh" +// Global includes +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#define LZCNT __lzcnt +#else +//#include +#define LZCNT __builtin_clzll +#endif + +// Local includes +#include "../../include/Multiply.h" +#include "../../include/device/MultiplyKernels.h" +#include "../../include/device/consistent_gpu_memory.h" +#include "../../include/devicetools/stream.h" +#include "../../include/meta_utils.h" +#include "../../include/device/acSpGEMM_DetermineBlockStarts.cuh" +#include "../../include/device/acSpGEMM_SpGEMM.cuh" +#include "../../include/device/acSpGEMM_MergeSimple.cuh" +#include "../../include/device/acSpGEMM_MergeMaxChunks.cuh" +#include "../../include/device/acSpGEMM_MergeGeneralized.cuh" +#include "../../include/device/acSpGEMM_ChunksToCSR.cuh" +#include "../../include/device/HelperFunctions.cuh" +#include "../../include/CustomExceptions.h" + + +#pragma once + +#include "../../include/dCSR.cuh" +#include "../../include/execution_stats.h" +#include "../../include/default_scheduling_traits.h" + +void startTimer(cudaEvent_t& start, CUstream stream = 0) +{ + HANDLE_ERROR(cudaEventRecord(start, stream)); +} + +float recordTimer(cudaEvent_t& start, cudaEvent_t& end, CUstream stream = 0) +{ + float time; + HANDLE_ERROR(cudaEventRecord(end, stream)); + HANDLE_ERROR(cudaEventSynchronize(end)); + HANDLE_ERROR(cudaEventElapsedTime(&time, start, end)); + return time; + return 0; +} + +using IndexType = uint32_t; +using OffsetType = uint32_t; + + +namespace ACSpGEMM { + + template + __host__ __forceinline__ T divup(T a, T b) + { + return (a + b - 1) / b; + } + + template + __host__ __forceinline__ T alignment(T size, size_t alignment) + { + return divup(size, alignment) * alignment; + } + + int id; + + template + void MultiplyImplementation(const dCSR& matA, const dCSR& matB, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& traits, ExecutionStats& stats, SEMIRING_t semiring) + { + HANDLE_ERROR(cudaGetLastError()); + + using ConsistentGPUMemory = ConsistentMemory; + + // the magic numbers to make it run smoother + const float OverallocationFactor = 1.1f; + const int ChunkPointerOverestimationFactor = 4; + const float ChunkOverallocationFactor = 1.0f; + using UintBitSet = std::bitset; + + if(DEBUG_MODE) + { + std::cout << "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"; + std::cout << "THREADS: " << threads << " | NNZPerThread: " << nnz_per_thread << " | InputElementsPerThreads: " << input_elements_per_thread << " | RetainElementsPerThreads: " << retain_elements_per_thread; + std::cout << " | MaxChunks: " << merge_max_chunks << " | MergePathOptions: " << merge_max_path_options << "| 
ChunkpointerOverestimationFactor: " << ChunkPointerOverestimationFactor << "\n"; + std::cout << "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"; + } + + // Helper variables + size_t memory_usage_in_Bytes{ 0 }; + const size_t chunckAllocationsSize{ 256 }; + const size_t numFlags{ 128 }; + const size_t numCounters{ 3 }; + const size_t mergeTypeCounters{ 4 }; + static size_t maxExpectedNNZ{ 500000000 }; //limit allocation... + static size_t minExpectedNNZ{ 10000000 }; //limit allocation... + // 10000000 + static float lastChunckBufferRequirementRatio{ 1.0f }; + const uint32_t nnzperblock{ threads * nnz_per_thread }; + size_t run{ 0 }, chunk_pointer_restart_run{ 0 }; + bool completed{ false }; + bool rowmerging{ false }; + MergeCaseOffsets mergeBlocks; + uint32_t* currentCounters, *currentChunckAllocation, *currentFlag; + uint32_t numSharedRows; + size_t size_to_allocate; + size_t upper_limit{ 3LL * 1024 * 1024 * 1024 }; + + // Kernels + AcSpGEMMKernels spgemm(threads); + + // Matrix information + size_t Arows = matA.rows; + size_t Acols = matA.cols; + size_t Brows = matB.rows; + size_t Bcols = matB.cols; + size_t Crows = Arows; + size_t Ccols = Bcols; + + if (Acols != Brows) + throw std::runtime_error("Unable to multiply matrix with matrix - invalid dimensions"); + + // Matrix Output estimation + double a_avg_row = matA.nnz / static_cast(Arows); + double b_avg_row = matB.nnz / static_cast(Brows); + double avg_row_overlap = b_avg_row / Bcols; + // note geometric sequence + double output_estimate = OverallocationFactor*Arows*b_avg_row * (1.0 - pow(1.0 - avg_row_overlap, a_avg_row)) / (avg_row_overlap); + + // chunks might get created earlier + double single_chunk_estimate = b_avg_row; + double current_overlap = avg_row_overlap; + double merges; + for (merges = 1; merges < static_cast(a_avg_row + 1.0); ++merges) + { + if (single_chunk_estimate >= retain_elements_per_thread*threads) + break; + single_chunk_estimate += (1 - current_overlap)*b_avg_row; + current_overlap = current_overlap + (1 - current_overlap)*avg_row_overlap; + } + HANDLE_ERROR(cudaGetLastError()); + + double intermediate_estimate = OverallocationFactor * a_avg_row / std::min(merges, a_avg_row) * single_chunk_estimate * Arows; + double mergepointer_estimate = std::max(intermediate_estimate, output_estimate) / (retain_elements_per_thread*threads) + 16 * 1024; + size_t expectedNNZ = std::max(minExpectedNNZ, std::min(maxExpectedNNZ, static_cast(lastChunckBufferRequirementRatio*std::max(intermediate_estimate, output_estimate)))); + size_to_allocate = (std::max(sizeof(typename SEMIRING_t::rightInput_t), sizeof(typename SEMIRING_t::output_t))+ sizeof(IndexType))*expectedNNZ*ChunkOverallocationFactor; + size_t free, total; + cudaMemGetInfo(&free, &total); + upper_limit = std::min(upper_limit, free / 3); + if (size_to_allocate > upper_limit) + size_to_allocate = upper_limit; + if(DEBUG_MODE) + { + std::cout << "A: " << Arows << "x" << Acols << " NNZ: " << matA.nnz << " avg row: " << a_avg_row << " " << "B: " << Brows << "x" << Bcols << " NNZ: " << matB.nnz << " avg row: " << b_avg_row << "\n"; + std::cout << "expected row overlap: " << avg_row_overlap << " overallocation: " << OverallocationFactor << "\n"; + std::cout << "expected nnz: " << static_cast(round(output_estimate)) << " expected temp: " << static_cast(round(intermediate_estimate)) << " mem alloc: " << expectedNNZ << "\n"; + std::cout << "mergepointer alloc " << 
static_cast(ChunkPointerOverestimationFactor*mergepointer_estimate) << " mergepointer estimate: " << mergepointer_estimate << "\n"; + } + + HANDLE_ERROR(cudaGetLastError()); + + // CUDA variables + CUstream stream = 0; + int blockSize = 256; + int gridSize(divup(Arows + 1, blockSize)); + const int number_merge_streams = 3; + static CUstream mergeStreams[number_merge_streams]; + for (int i = 0; i < number_merge_streams; ++i) + { + if(stats.measure_all) + mergeStreams[i] = stream; + else + cudaStreamCreate(&mergeStreams[i]); + } + HANDLE_ERROR(cudaGetLastError()); + + cudaEvent_t ce_start, ce_stop, individual_start, individual_stop; + cudaEventCreate(&ce_start); cudaEventCreate(&ce_stop); cudaEventCreate(&individual_start); cudaEventCreate(&individual_stop); + HANDLE_ERROR(cudaGetLastError()); + // GPU Memory Helper structures - general + static ConsistentGPUMemory chunckPointers; + static ConsistentGPUMemory combinedGeneralMemory; + static ConsistentGPUMemory chunk_counter_cptr; + uint32_t* chunckAllocations{ nullptr }; + uint32_t* blockStarts{ nullptr }; + uint32_t* sharedRowTracker{ nullptr }; + void** outputRowListHead{ nullptr }; + uint32_t* outputRowChunkCounter{ nullptr }; + uint32_t* completion_status{ nullptr }; + uint32_t* chunk_counter{ nullptr }; + void* prefixSumTemp{ nullptr }; + + // GPU Memory Helper structures - merge stage allocation + static ConsistentGPUMemory combineBlockOffsets; // SIZE: combineBlockOffsetsSize * sizeof(IndexType) + + static ConsistentGPUMemory chunk_indices_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + static ConsistentGPUMemory chunk_values_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + //FIXME: RL - This is no longer *8, but sizeof(Either). Probably *16 because alignment. this shoudln't matter? + //FIXME: till confirmed/tested irrelevant + + static ConsistentGPUMemory chunk_multiplier_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + + static ConsistentGPUMemory combinedMergeStageMemory; + static uint32_t* shared_rows_handled{ nullptr }; + static uint32_t* restart_completion{ nullptr }; + static uint32_t* chunkElementConsumedAndPath{ nullptr }; + uint32_t* num_chunks{ nullptr }; + uint32_t* chunkElementCountDataOffset{ nullptr }; + uint32_t* sample_offset{ nullptr }; + static IndexType** chunk_indices{ nullptr }; + static Either* chunk_values{ nullptr }; + static typename SEMIRING_t::leftInput_t* chunk_multiplier{ nullptr }; + HANDLE_ERROR(cudaDeviceSynchronize()); + + + // CPU Memory Helper structures + static RegisteredMemoryVar chunkPointerSize(0); + static RegisteredMemoryVar outputRowInfoSize(0); + static RegisteredMemoryVar prefixSumTempMemSize; + static RegisteredMemoryVar combineBlockOffsetsSize(0); + static RegisteredMemoryVar mergeBlocksAlloc(0); + static RegisteredMemoryVar lastSharedRows(0); + static RegisteredMemoryVar merge_simple_rows(0); + static RegisteredMemoryVar merge_max_chunks_rows(0); + static RegisteredMemoryVar merge_generalized_rows(0); + uint32_t flagsAndListAllocCounters[numFlags + numCounters]; + size_t tempChunkBufferSizes[256]; + CU::unique_ptr tempChunkBuffers[256]; + tempChunkBufferSizes[0] = alignment(size_to_allocate, 16); + // + // TSOPF_RS_b300_c2.mtx shows very weird results if this is done here?? 
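+	//
+	// Note on the sizing helpers used just below (illustrative numbers only):
+	// divup(a, b) is the integer ceiling (a + b - 1) / b, and alignment(s, a)
+	// rounds s up to the next multiple of a via divup(s, a) * a, e.g.
+	// alignment(1000, 16) = divup(1000, 16) * 16 = 63 * 16 = 1008. The request
+	// size_to_allocate itself was derived further above from the geometric-series
+	// estimate of the output/intermediate nnz, so the first temporary chunk buffer
+	// is that estimate padded up to a 16-byte multiple.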
+ // + // Allocate temporary memory for chunks + tempChunkBuffers[0] = CU::allocMemory(tempChunkBufferSizes[0]); + + HANDLE_ERROR(cudaDeviceSynchronize()); + + // ############################## + startTimer(ce_start, stream); + // ############################## + if(stats.measure_all) + startTimer(individual_start, stream); + + + // Allocate memory for block offsets + uint32_t requiredBlocks = divup(matA.nnz, nnzperblock); + HANDLE_ERROR(cudaDeviceSynchronize()); + + // Allocate memory for chunk and shared row tracker + if (outputRowInfoSize < Crows) + { + //---------------------------------------------------------- + prefixSumTempMemSize = spgemm.tempMemSize(Crows); + //---------------------------------------------------------- + outputRowInfoSize = Crows; + } + + HANDLE_ERROR(cudaGetLastError()); + + + // Allocate combined general memory + size_t combinedGeneralMemory_size = + /*chunckAllocations*/alignment((chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters) * sizeof(uint32_t), 8) + + /*blockStarts*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + /*completion_status*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + ///*chunk_counter*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + /*outputRowListHead*/ alignment(Crows * sizeof(void*), 8) + + /*outputRowChunkCounter*/ alignment(Crows * sizeof(uint32_t), 8) + + /*sharedRowTracker*/ alignment(Crows * sizeof(uint32_t), 8) + + /*prefixSumTemp*/ alignment(static_cast(prefixSumTempMemSize), 8); + combinedGeneralMemory.assure(combinedGeneralMemory_size); + memory_usage_in_Bytes += combinedGeneralMemory_size; + + // Place pointers in correct positions + outputRowListHead = combinedGeneralMemory.get(); + chunckAllocations = reinterpret_cast(outputRowListHead + (alignment(Crows * sizeof(void*), 8) / sizeof(void*))); + completion_status = chunckAllocations + alignment((chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters) * sizeof(uint32_t), 8) / sizeof(uint32_t); + /*chunk_counter = completion_status + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t));*/ + blockStarts = completion_status + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t)); + outputRowChunkCounter = blockStarts + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t)); + sharedRowTracker = outputRowChunkCounter + (alignment(Crows * sizeof(uint32_t), 8) / sizeof(uint32_t)); + prefixSumTemp = reinterpret_cast(sharedRowTracker + (alignment(Crows * sizeof(uint32_t), 8) / sizeof(uint32_t))); + HANDLE_ERROR(cudaGetLastError()); + + // TODO: Move back in, currently sometimes produces crashes for whatever reason + chunk_counter_cptr.assure((requiredBlocks + 2) * sizeof(uint32_t)); + chunk_counter = chunk_counter_cptr.get(); + //std::cout << "MADE IT IN " << std::endl; + // Allocate memory for chunk pointers + size_t targetChunkPointerSize =ChunkPointerOverestimationFactor*mergepointer_estimate; //fixme : rl + if (chunkPointerSize < targetChunkPointerSize) + { + chunkPointerSize = targetChunkPointerSize; + chunckPointers.assure((targetChunkPointerSize) * sizeof(void*)); + memory_usage_in_Bytes += (targetChunkPointerSize) * sizeof(void*); + } + + // Allocate memory for offsets + CU::unique_ptr newmat_offsets; + if (matOut.rows != Crows) + { + newmat_offsets = CU::allocMemory((Crows + 1) * sizeof(OffsetType)); + + memory_usage_in_Bytes += (Crows + 1) * sizeof(OffsetType); + } + else + { + newmat_offsets.consume(reinterpret_cast(matOut.row_offsets)); + 
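+			// matOut.rows == Crows in this branch, so the existing offsets buffer already
+			// holds Crows + 1 entries; consume() above adopts it into newmat_offsets, and
+			// the raw pointer in matOut is cleared next so the same device allocation is
+			// not released twice once matOut is later filled with the result.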
matOut.row_offsets = nullptr; + } + HANDLE_ERROR(cudaDeviceSynchronize()); + + spgemm.setLaunchDimensions(gridSize, stream, blockSize); + HANDLE_ERROR(cudaDeviceSynchronize()); + //---------------------------------------------------------- + spgemm.h_DetermineBlockStarts( + Arows, + matA.row_offsets, + blockStarts, + reinterpret_cast(outputRowListHead), + outputRowChunkCounter, + newmat_offsets.get(), + requiredBlocks, + completion_status, + (chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters), + chunckAllocations, + (lastSharedRows), + shared_rows_handled, + restart_completion, + chunk_counter, + (lastSharedRows) * (generalized_merge_max_path_options + helper_overhead), + chunkElementConsumedAndPath + ); + HANDLE_ERROR(cudaDeviceSynchronize()); + //---------------------------------------------------------- + if(stats.measure_all) + stats.duration_blockstarts = recordTimer(individual_start, individual_stop, stream); + HANDLE_ERROR(cudaGetLastError()); + do + { + HANDLE_ERROR(cudaDeviceSynchronize()); + currentChunckAllocation = chunckAllocations + (2 * run); + currentFlag = chunckAllocations + (chunckAllocationsSize + run + chunk_pointer_restart_run); + currentCounters = chunckAllocations + (chunckAllocationsSize + numFlags); + if (!rowmerging) + { + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Start spgemm stage with " << requiredBlocks<< " and run: " << run << "\n"; + } + if(stats.measure_all) + startTimer(individual_start, stream); + + // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ + // Stage 2 - Compute SpGEMM + // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ + spgemm.setLaunchDimensions(requiredBlocks, stream, threads); + HANDLE_ERROR(cudaDeviceSynchronize()); + if (Arows < 0x10000 && Bcols < 0x10000) + { + if(DEBUG_MODE) + { + std::cout << "Case 1:\n"; + } + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + + //we can just use 16bit + //---------------------------------------------------------- + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempChunkBuffers[run].get(), currentChunckAllocation, currentChunckAllocation + 1, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + newmat_offsets.get(), outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2, semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + + HANDLE_ERROR(cudaGetLastError()); + } + else if (Bcols < (1ull << LZCNT(nnz_per_thread*threads)) - 1) + { + if(DEBUG_MODE) + { + std::cout << "Case 2:\n"; + } + HANDLE_ERROR(cudaDeviceSynchronize()); + //remap every local row to reduce bit count and use remaining for col ids + //---------------------------------------------------------- + HANDLE_ERROR(cudaGetLastError()); + + HANDLE_ERROR(cudaDeviceSynchronize()); + uint32_t* tempC = tempChunkBuffers[run].get(); + HANDLE_ERROR(cudaGetLastError()); + + void** chunckP = chunckPointers.get(); + HANDLE_ERROR(cudaGetLastError()); + + OffsetType* nmat_f = newmat_offsets.get(); + HANDLE_ERROR(cudaGetLastError()); + + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempC, currentChunckAllocation, currentChunckAllocation + 1, 
tempChunkBufferSizes[run], + chunckP, currentCounters, chunkPointerSize, + nmat_f, outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2, semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + } + else + { + if(DEBUG_MODE) + { + std::cout << "Case 3:\n"; + } + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + //---------------------------------------------------------- + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempChunkBuffers[run].get(), currentChunckAllocation, currentChunckAllocation + 1, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + newmat_offsets.get(), outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2,semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + } + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw SpGEMMException(); + // } + if(stats.measure_all) + stats.duration_spgemm += recordTimer(individual_start, individual_stop, stream); + } + else + { + + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Start Merge Stage\n"; + } + uint32_t simple_restart_offset = 0; + uint32_t max_chunks_restart_offset = mergeBlocks.shared_rows_simple; + uint32_t generalized_restart_offset = mergeBlocks.shared_rows_simple + mergeBlocks.shared_rows_max_chunks; + // Simple Case -> Output fits in shared + if (mergeBlocks.shared_rows_simple) + { + if(stats.measure_all) + startTimer(individual_start, mergeStreams[0]); + + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_simple, mergeStreams[0], threads); + if (Bcols < 1ull << LZCNT(threads - 1)) + { + if (DEBUG_MODE) + { + std::cout << "Case: 1\n"; + } + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsSimple< nnz_per_thread, threads, blocks_per_mp, input_elements_per_thread, retain_elements_per_thread, merge_max_chunks, merge_max_path_options, typename SEMIRING_t::output_t, IndexType, OffsetType, false,T,U,Label, SEMIRING_t>( + combineBlockOffsets.get() + (3 * numSharedRows), combineBlockOffsets.get(), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, simple_restart_offset, currentCounters + 2, semiring + ); + //---------------------------------------------------------- + } + else + { + if (DEBUG_MODE) + { + std::cout << "Case: 2\n"; + } + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsSimple< nnz_per_thread, threads, blocks_per_mp, input_elements_per_thread, retain_elements_per_thread, merge_max_chunks, merge_max_path_options, typename SEMIRING_t::output_t, IndexType, OffsetType, true,T,U,Label, SEMIRING_t>( + combineBlockOffsets.get() + (3 * numSharedRows), combineBlockOffsets.get(), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, 
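The three compute-kernel cases above differ only in how indices are encoded in the per-thread workspace, as the inline comments note: 16-bit ids when both output dimensions fit below 0x10000 (Case 1), locally remapped rows with the leftover bits used for column ids (Case 2), and the general encoding otherwise (Case 3). A rough sketch of that decision, where spareBits is a stand-in for LZCNT(nnz_per_thread*threads) and all names are illustrative:

#include <cstddef>

enum class IdEncoding { Bits16, RemappedRows, Full };

IdEncoding chooseCase(std::size_t Arows, std::size_t Bcols, unsigned spareBits) {
    if (Arows < 0x10000 && Bcols < 0x10000) return IdEncoding::Bits16;       // Case 1
    if (Bcols < (1ull << spareBits) - 1)    return IdEncoding::RemappedRows; // Case 2
    return IdEncoding::Full;                                                 // Case 3
}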
tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, simple_restart_offset, currentCounters + 2,semiring + ); + //---------------------------------------------------------- + } + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeSimpleCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_simple += recordTimer(individual_start, individual_stop, mergeStreams[0]); + } + HANDLE_ERROR(cudaGetLastError()); + // Complex Case -> Output gets merged through paths over MAX_CHUNKS + if (mergeBlocks.shared_rows_max_chunks) + { + if (DEBUG_MODE) + { + std::cout << "Case: 4\n"; + } + if(stats.measure_all) + startTimer(individual_start, mergeStreams[1]); + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_max_chunks, mergeStreams[1], threads); + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsMaxChunks ( + NULL, combineBlockOffsets.get() + (1 * numSharedRows), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, + chunk_indices, chunk_values, chunk_multiplier, + chunkElementCountDataOffset, max_chunks_restart_offset, num_chunks, currentCounters + 2, semiring); + //---------------------------------------------------------- + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeMaxChunksCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_max += recordTimer(individual_start, individual_stop, mergeStreams[1]); + } + HANDLE_ERROR(cudaGetLastError()); + // General Case -> Handles cases with more than MAX_CHUNKS chunks + if (mergeBlocks.shared_rows_generalized) + { + if (DEBUG_MODE) + { + std::cout << "Case: 5\n"; + } + if(stats.measure_all) + startTimer(individual_start, mergeStreams[2]); + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_generalized, mergeStreams[2], threads); + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsGeneralized( + NULL, combineBlockOffsets.get() + (2 * numSharedRows), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, + sample_offset, chunkElementConsumedAndPath, generalized_restart_offset, currentCounters + 2, + semiring + ); + //---------------------------------------------------------- + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeGeneralizedCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_generalized += recordTimer(individual_start, individual_stop, mergeStreams[2]); + } + } + //HANDLE_ERROR(cudaGetLastError()); + // // Copy back flags + /*cudaPointerAttributes attr; + for (int i = 0; i < numFlags+numCounters; ++i) { + HANDLE_ERROR(cudaPointerGetAttributes(&attr, chunckAllocations + chunckAllocationsSize + i)); + uint32_t test = flagsAndListAllocCounters[i]; + if(attr.type != 2) std::cout << attr.type << std::endl; + }*/ + + //std::cout << "FLAG COPY " << id << std::endl; + /*for (int i = 0; i < (numFlags + numCounters); ++i) + { + cudaPointerAttributes attr; + cudaError_t error = cudaPointerGetAttributes(&attr, chunckAllocations + chunckAllocationsSize + i); + + if (error != cudaSuccess) + { + std::cerr << "Error getting 
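As the comments above state, shared rows (rows written by more than one block) fall into three merge classes: rows whose merged output fits into shared memory, rows with at most MAX_CHUNKS contributing chunks that are merged along paths, and everything larger, which is handled by the generalized kernel. A schematic of that classification — the thresholds are stand-ins for the real template parameters and shared-memory capacity:

enum class MergePath { Simple, MaxChunks, Generalized };

MergePath classifyRow(unsigned chunkCount, unsigned mergedRowLength,
                      unsigned sharedCapacity, unsigned mergeMaxChunks) {
    if (mergedRowLength <= sharedCapacity) return MergePath::Simple;    // output fits in shared
    if (chunkCount <= mergeMaxChunks)      return MergePath::MaxChunks; // path merge over <= MAX_CHUNKS
    return MergePath::Generalized;                                      // more than MAX_CHUNKS chunks
}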
pointer attributes: " + << cudaGetErrorString(error) << std::endl; + } + else if (attr.type != cudaMemoryTypeDevice && id == 0) + { + std::cout << "ERR" << std::endl; // This should never happen + } + }*/ + //MPI_Barrier(MPI_COMM_WORLD); // Synchronize after CUDA operations + HANDLE_ERROR(cudaGetLastError()); + HANDLE_ERROR(cudaMemcpy(flagsAndListAllocCounters, chunckAllocations + chunckAllocationsSize, (numFlags + numCounters) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + //MPI_Barrier(MPI_COMM_WORLD); // Synchronize after CUDA operations + //std::cout << "FLAG COPY DONE " << id << std::endl; + completed = flagsAndListAllocCounters[run + chunk_pointer_restart_run] == 0; + + if (!completed) + { + // if (stats.measure_all && stats.duration_merge_simple + stats.duration_merge_max + stats.duration_merge_generalized > 10000) + // throw MergeLoopingException(); + + + uint32_t return_value = flagsAndListAllocCounters[run + chunk_pointer_restart_run]; + if (UintBitSet(return_value).test(0)) + { + if (DEBUG_MODE) + { + std::cout << "Chunk Memory Restart allocating space for " << tempChunkBufferSizes[run] / (sizeof(typename SEMIRING_t::rightInput_t) + sizeof(IndexType)) << " elements\n"; + } + // Get more chunk memory + auto new_buffer_size = tempChunkBufferSizes[run]; + tempChunkBufferSizes[run+1] = new_buffer_size; + tempChunkBuffers[run+1] = CU::allocMemory(new_buffer_size); + if (++run == chunckAllocationsSize / 2) { + std::cout << "Out of memory " << std::endl; + throw RestartOutOfMemoryException(); + } + HANDLE_ERROR(cudaGetLastError()); + } + if (UintBitSet(return_value).test(1)) + { + if (DEBUG_MODE) + { + std::cout << "Chunk Pointer Restart allocating " << targetChunkPointerSize << " new pointers\n"; + } + // Get more chunk pointers + chunkPointerSize += targetChunkPointerSize; + chunckPointers.increaseMemRetainData((targetChunkPointerSize) * 8); + targetChunkPointerSize *= 2; + if (++chunk_pointer_restart_run == chunckAllocationsSize / 2) + throw RestartOutOfChunkPointerException(); + HANDLE_ERROR(cudaMemcpy(currentCounters, currentCounters + 2, sizeof(uint32_t), cudaMemcpyDeviceToDevice)); + } + } + if (completed && !rowmerging) + { + numSharedRows = flagsAndListAllocCounters[numFlags + 1]; + if (numSharedRows > 0) + { + if(stats.measure_all) + startTimer(individual_start, stream); + + if (combineBlockOffsetsSize < 4 * (numSharedRows + 1)) + { + combineBlockOffsetsSize = 4 * (numSharedRows + 1024); + combineBlockOffsets.assure(combineBlockOffsetsSize * sizeof(IndexType)); + memory_usage_in_Bytes += combineBlockOffsetsSize * sizeof(IndexType); + } + CUdeviceptr mergeTypeCounters = reinterpret_cast(chunckAllocations) + 4 * (chunckAllocationsSize + numFlags + numCounters); + + //---------------------------------------------------------- + mergeBlocks = spgemm.assignCombineBlocks(numSharedRows, prefixSumTemp, prefixSumTempMemSize, sharedRowTracker, newmat_offsets, outputRowChunkCounter, combineBlockOffsets, mergeTypeCounters, stream); + //---------------------------------------------------------- + + completed = false; + rowmerging = true; + + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Assigned " << numSharedRows << " shared rows to blocks, starting \n\t\t" + << mergeBlocks.shared_rows_simple << " simple merges for " << mergeBlocks.shared_rows_simple_rows << " rows,\n\t\t" + << mergeBlocks.shared_rows_max_chunks << " max chunk mergers, and\n\t\t" + << mergeBlocks.shared_rows_generalized << " general mergers\n"; + } + + // Set 
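The restart handling above is a small handshake: each stage leaves the flag word at zero on success, sets bit 0 when it runs out of chunk memory, and sets bit 1 when it runs out of chunk pointers; the host grows the corresponding resource and repeats the stage until the flag reads zero. A compact sketch of that loop body:

#include <bitset>
#include <cstdint>

template <typename GrowMem, typename GrowPtrs>
bool handleRestartFlag(std::uint32_t flag, GrowMem growChunkMemory, GrowPtrs growChunkPointers) {
    if (flag == 0) return true;              // stage completed, leave the do/while loop
    std::bitset<32> bits(flag);
    if (bits.test(0)) growChunkMemory();     // bit 0: allocate another temp chunk buffer
    if (bits.test(1)) growChunkPointers();   // bit 1: enlarge the chunk-pointer array
    return false;                            // re-run the stage with more resources
}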
merge stage row stats + stats.shared_rows = numSharedRows; + stats.simple_mergers = mergeBlocks.shared_rows_simple; + stats.simple_rows = mergeBlocks.shared_rows_simple_rows; + stats.complex_rows = mergeBlocks.shared_rows_max_chunks; + stats.generalized_rows = mergeBlocks.shared_rows_generalized; + merge_simple_rows = mergeBlocks.shared_rows_simple; + merge_max_chunks_rows = mergeBlocks.shared_rows_max_chunks; + merge_generalized_rows = mergeBlocks.shared_rows_generalized; + + // Allocate memory for all helper data structures + size_t combinedMergeStageMemory_size = + /*shared_rows_handled*/((numSharedRows) * sizeof(uint32_t)) + + /*restart_completion*/((numSharedRows) * sizeof(uint32_t)) + + /*chunkElementConsumedAndPath*/((numSharedRows) * (generalized_merge_max_path_options + helper_overhead) * sizeof(uint32_t)) + + /*chunkElementCountDataOffset*/(((numSharedRows) * merge_max_chunks) * sizeof(uint32_t)) + + /*num_chunks*/((numSharedRows) * sizeof(uint32_t)) + + /*sample_offset*/(((numSharedRows) * (threads) * sizeof(uint32_t))); //+ + ///* chunk_indices*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*)) + + ///*chunk_values*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::input_t*)) + + ///*chunk_multiplier*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::input_t)); + combinedMergeStageMemory.assure(combinedMergeStageMemory_size); + memory_usage_in_Bytes += combinedMergeStageMemory_size; + HANDLE_ERROR(cudaGetLastError()); + //// Place pointers in memory allocation + shared_rows_handled = combinedMergeStageMemory.get(); + restart_completion = shared_rows_handled + (numSharedRows); + chunkElementConsumedAndPath = restart_completion + (numSharedRows); + chunkElementCountDataOffset = chunkElementConsumedAndPath + (numSharedRows) * (generalized_merge_max_path_options + helper_overhead); + num_chunks = chunkElementCountDataOffset + ((numSharedRows) * merge_max_chunks); + sample_offset = num_chunks + (numSharedRows); + + // TODO: Why does this work?????????????????????????? + chunk_indices_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*)); + chunk_indices = chunk_indices_cptr.get(); + chunk_values_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof( Either)); + chunk_values = chunk_values_cptr.get< Either>(); + chunk_multiplier_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::leftInput_t)); + chunk_multiplier = chunk_multiplier_cptr.get(); + + + // TODO: Why does this NOT work?????????????????????????? 
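One plausible, unverified explanation for the two TODOs above: in the packed layout, chunk_indices would begin directly after a run of uint32_t arrays, so its offset from the pool base is only guaranteed to be 4-byte aligned while the array stores 8-byte pointers; the separate allocation used instead always returns suitably aligned storage. A small hypothetical check of that alignment condition:

#include <cstddef>
#include <cstdint>

// Returns false when a pointer-sized array placed after `precedingUint32Count`
// 32-bit words would be misaligned for 8-byte loads (hypothesis check only).
bool pointerArrayWouldBeAligned(std::size_t precedingUint32Count) {
    std::size_t offsetBytes = precedingUint32Count * sizeof(std::uint32_t);
    return offsetBytes % alignof(void*) == 0;
}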
+ /*chunk_indices = reinterpret_cast(chunk_multiplier + ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks));*/ + /*chunk_values = reinterpret_cast(chunk_indices + ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks));*/ + // chunk_multiplier = reinterpret_cast(sample_offset + ((numSharedRows) * (threads))); + + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*); + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(Either); + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::rightInput_t); + + if (numSharedRows > lastSharedRows) + { + cudaMemset(combinedMergeStageMemory.get(), 0, + /*chunkElementConsumedAndPath*/((numSharedRows) * (generalized_merge_max_path_options + helper_overhead) * sizeof(uint32_t)) + + /*shared_rows_handled*/((numSharedRows) * sizeof(uint32_t)) + + /*restart_completion*/((numSharedRows) * sizeof(uint32_t)) + ); + lastSharedRows = numSharedRows; + } + if(stats.measure_all) + stats.duration_merge_case_computation = recordTimer(individual_start, individual_stop, stream); + HANDLE_ERROR(cudaGetLastError()); + } + } + } while (!completed); + //std::cout << "WE OUT" << std::endl; + //delete[] flagsAndListAllocCounters; + // Let's write the chunks out to a csr matrix + if(stats.measure_all) + startTimer(individual_start, stream); + + //---------------------------------------------------------- + spgemm.computeRowOffsets(Crows, prefixSumTemp, prefixSumTempMemSize, newmat_offsets, stream); + //---------------------------------------------------------- + + // Allocate output matrix + IndexType matrix_elements; + CUdeviceptr offs = newmat_offsets; + offs += sizeof(IndexType) * Crows; + HANDLE_ERROR(cudaMemcpy(&matrix_elements, reinterpret_cast(offs), sizeof(IndexType), cudaMemcpyDeviceToHost)); + + if (matOut.nnz != matrix_elements) + { + //std::cout << "Reallocation HERE ################" << matOut.nnz << " | " << matrix_elements <<"\n"; + matOut.alloc(Crows, Ccols, matrix_elements, false); + } + matOut.row_offsets = std::move(newmat_offsets.getRelease()); + + //---------------------------------------------------------- + spgemm.h_copyChunks(chunckPointers.get(), currentCounters, + matOut.data, matOut.col_ids, matOut.row_offsets); + //---------------------------------------------------------- + if(stats.measure_all) + stats.duration_write_csr = recordTimer(individual_start, individual_stop, stream); + + if (stats.measure_all) + { + stats.mem_allocated_chunks = tempChunkBufferSizes[0] * (run + 1); + uint32_t* d_current_chunk_allocation = chunckAllocations + (2 * run); + uint32_t h_current_chunk_allocation = 0; + HANDLE_ERROR(cudaMemcpy(&h_current_chunk_allocation, d_current_chunk_allocation, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + stats.mem_used_chunks = tempChunkBufferSizes[0] * run + h_current_chunk_allocation; + } + stats.restarts = run + chunk_pointer_restart_run; + HANDLE_ERROR(cudaGetLastError()); + // ############################## + stats.duration = recordTimer(ce_start, ce_stop, stream); + // ############################## + + // Stream cleanup + if (!(stats.measure_all)) + { + for (int i = 0; i < number_merge_streams; ++i) + cudaStreamDestroy(mergeStreams[i]); + } + HANDLE_ERROR(cudaGetLastError()); + return; + } + + template + struct Selection + { + CB& cb; + Selection(CB& cb) : cb(cb) {} + }; + + template + struct CallSelection + { + static void call(CB &cb) + { + cb. 
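The CSR write-out above follows the usual scan pattern: computeRowOffsets turns the per-row element counts into row offsets via a prefix sum, the entry at index Crows is the total nnz (read back with a single cudaMemcpy), and h_copyChunks then streams the chunk contents into the newly sized data/col_ids arrays. A host-side sketch of the offsets step, with illustrative types:

#include <cstdint>
#include <numeric>
#include <vector>

// Exclusive prefix sum over per-row counts; offsets.back() is the total nnz.
std::vector<std::uint32_t> rowCountsToOffsets(const std::vector<std::uint32_t>& counts) {
    std::vector<std::uint32_t> offsets(counts.size() + 1, 0);
    std::partial_sum(counts.begin(), counts.end(), offsets.begin() + 1);
    return offsets;
}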
template call(); + } + }; + + struct EnumFin + { + template + static bool call(Selection cb) + { + CallSelection::call(cb.cb); + return true; + } + }; + + template + struct EnumOption + { + template + static bool call(Selection cb, int value, TYPES... values) + { + if (value == CURRENT) + { + return NEXT::call(Selection(cb.cb), values...); + } + else + return EnumOption::call(cb, value, values...); + } + }; + + template + struct EnumOption + { + template + static bool call(Selection cb, int value, TYPES... values) + { + if (value == MAX) + { + return NEXT::call(Selection(cb.cb), values...); + } + else + return false; + } + }; + + + template + struct MultiplyCall + { + const dCSR& A; + const dCSR& B; + dCSR &matOut; + SEMIRING_t semiring; + + const GPUMatrixMatrixMultiplyTraits& scheduling_traits; + ExecutionStats& exec_stats; + + MultiplyCall(const dCSR& A, const dCSR& B, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& scheduling_traits, ExecutionStats& exec_stats, SEMIRING_t semiring) : + A(A), B(B), matOut(matOut), scheduling_traits(scheduling_traits), exec_stats(exec_stats), semiring(semiring) + { + + } + + template + void call() + { + const int RealBlocksPerMP = (256 * BlocksPerMP + Threads - 1) / Threads; + ACSpGEMM::MultiplyImplementation(A, B, matOut, scheduling_traits, exec_stats,semiring); + } + }; + + + template < typename SEMIRING_t> + void Multiply(const dCSR& A, const dCSR& B, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& scheduling_traits, ExecutionStats& exec_stats, bool DEBUG_MODE, SEMIRING_t semiring) + { + HANDLE_ERROR(cudaGetLastError()); + MultiplyCall call(A, B, matOut, scheduling_traits, exec_stats, semiring); + HANDLE_ERROR(cudaGetLastError()); + + bool called = EnumOption<128, 256, 128, + EnumOption<1, 1, 1, + EnumOption<2, 2,2, + EnumOption<2, 2, 2, + EnumOption<1, 1, 1, + EnumOption<16, 16, 8, + EnumOption<512, 512, 256, + EnumOption<8, 8, 8, + EnumOption<0, 1, 1>>>>>>>>> + ::call(Selection>(call), scheduling_traits.Threads, scheduling_traits.BlocksPerMp, scheduling_traits.NNZPerThread, scheduling_traits.InputElementsPerThreads, scheduling_traits.RetainElementsPerThreads, scheduling_traits.MaxChunksToMerge, scheduling_traits.MaxChunksGeneralizedMerge, scheduling_traits.MergePathOptions, (int)DEBUG_MODE); + if(!called) + { + std::cout << "Configuration not instantiated!\n"; + } + }; +} + diff --git a/include/GALATIC/source/device/memory.cuh b/include/GALATIC/source/device/memory.cuh new file mode 100644 index 00000000..2103e0a2 --- /dev/null +++ b/include/GALATIC/source/device/memory.cuh @@ -0,0 +1,63 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and 
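For readers unfamiliar with the EnumOption chain above: it maps the runtime scheduling traits onto one of the pre-instantiated template configurations by recursing over a compile-time value range and comparing each candidate with the runtime value; if nothing matches, call() returns false and "Configuration not instantiated!" is printed. A stripped-down, single-parameter version of the same idea:

#include <iostream>

template <int THREADS>
struct Kernel { static void run() { std::cout << "instantiated for " << THREADS << " threads\n"; } };

// Walk the compile-time range CUR, CUR+STEP, ..., MAX and invoke the matching instantiation.
template <int CUR, int MAX, int STEP>
bool dispatchThreads(int value) {
    if (value == CUR) { Kernel<CUR>::run(); return true; }
    if constexpr (CUR + STEP <= MAX) return dispatchThreads<CUR + STEP, MAX, STEP>(value);
    return false;   // configuration not instantiated
}

// Usage: dispatchThreads<128, 256, 128>(threads) covers the {128, 256} options.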
this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +#pragma once + +#include "../../include/devicetools/error.h" +#include "../../include/devicetools/memory.h" +#include +#include + + + +namespace CU +{ + unique_ptr allocMemory(std::size_t size) + { + CUdeviceptr ptr; + cudaMalloc(reinterpret_cast(&ptr), size); + return unique_ptr(ptr); + } + + unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size) + { + CUdeviceptr ptr; + cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); + return unique_ptr(ptr); + } + + pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size) + { + CUdeviceptr ptr; + std::size_t pitch; + cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); + return pitched_memory(unique_ptr(ptr), pitch); + } +} diff --git a/include/GALATIC/source/main.cu b/include/GALATIC/source/main.cu new file mode 100644 index 00000000..616951c2 --- /dev/null +++ b/include/GALATIC/source/main.cu @@ -0,0 +1,436 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
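The CU::allocMemory wrappers above exist so that device allocations are owned by a smart-pointer type instead of raw CUdeviceptr values. A minimal sketch of that RAII idea — illustrative only, not the devicetools API:

#include <cuda_runtime.h>
#include <cstddef>

class DeviceBuffer {
    void* ptr_ = nullptr;
public:
    explicit DeviceBuffer(std::size_t bytes) { cudaMalloc(&ptr_, bytes); }
    ~DeviceBuffer() { cudaFree(ptr_); }                     // freed exactly once
    DeviceBuffer(const DeviceBuffer&) = delete;             // non-copyable to avoid double free
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    void* get() const { return ptr_; }
};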
+// + +/*!/------------------------------------------------------------------------------ + * Main.cpp + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// Local includes +#include "CSR.cuh" +#include "COO.cuh" +#include "Vector.h" +#include "dCSR.cuh" +#include "dVector.h" +#include "device/Multiply.cuh" +#include "Transpose.h" +#include "Compare.cuh" +#include "CPU_SpGEMM.h" +// CuSparse include +//#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #ifndef NONSPARSE +// #include "nsparse/include/nsparseMultiply.h" +// #endif + +// // RMerge include +// #ifndef NORMERGE +// #include "RMerge/include/rmergeMultiply.h" +// #endif +// const uint64_t max(uint64_t x, uint64_t y) { +// return x < y ? x :y; +// } +// // BhSparse include +// #ifndef NOBHSPARSE +// #include"bhSparse/include/bhSparseMultiply.h" +// #endif + +//foo::foo(int x) { +// this->a =x; +// +//} +template +void log_good(T& s) { + std::cout << "\033[1;31," << s << "\033[0m"; +} + +struct triv {}; + +struct mr2 { + int16_t val; + uint8_t temp; + uint8_t temp2; + + uint8_t temp3; + + uint8_t temp4; + + uint8_t temp5; + +}; + +struct MinRing : SemiRing { + int16_t val; + int16_t val2; + + + // __device__ __host__ MinRing(int32_t x, int32_t y) { + // val = x; + // } + // __device__ __host__ MinRing(int32_t x) { + // val = x; + // } + + // __device__ __host__ ~MinRing() { + // } + + // __device__ __host__ MinRing() { + // val = INT16_MIN; + // } + + static MinRing Init(double x) { + MinRing res; + res.val = (short) x; + return res; + } + __device__ __host__ mr2 multiply( MinRing & a, MinRing & b) const { + return mr2 { static_cast(a.val == INT16_MAX || b.val == INT16_MAX ? INT16_MAX : a.val + b.val ),0}; + } + __device__ __host__ mr2 add(const mr2 & a, const mr2 & b)const { + + return mr2 { a.val < b.val ? a.val : b.val,0} ; + } + + __device__ bool operator==(const MinRing& rhs) const + { + return val == rhs.val; + } + + static __host__ __device__ MinRing MultiplicativeIdentity() { + MinRing result; + result.val = 0; + return result; + } + static __host__ __device__ mr2 AdditiveIdentity() { + return mr2 { INT16_MAX ,0}; + } +}; + + + + + + +unsigned int padding = 0; +template +std::string typeext() { + //FIXME not-C++ standard compliant + return typeid(T).name(); +} +template<> +std::string typeext() +{ + return std::string(""); +} +template<> std::string typeext() +{ + return std::string("i32_"); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +void printCheckMark() +{ + printf("\n #\n #\n #\n # #\n # #\n #\n\n"); +} + +void printCross() +{ + printf("\n # # \n # # \n # # \n # \n # # \n # # \n # # \n\n"); +} + +int main(int argc, char *argv[]) +{ + + + std::cout << "########## ac-SpGEMM ##########" << std::endl; + + char *filename; + bool print_stats{ false }; + if (argc == 1) + { + std::cout << "Require filename of .mtx as first argument" << std::endl; + return -1; + } + + filename = argv[1]; + + int device = 0; + // if (argc >= 3) + // device = std::stoi(argv[2]); + + bool testing = false; + // if(argc >= 4) + // testing = std::stoi(argv[3]) > 0 ? 
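As a concrete reference for what the MinRing semiring above computes: substituting (min, +) for (add, multiply) means every output entry is C(i,j) = min over k of (A(i,k) + B(k,j)), with INT16_MAX acting as the additive identity ("no path") and the multiply saturating at INT16_MAX. A plain CPU illustration for a single dense entry, independent of the GALATIC types:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::int16_t minPlusEntry(const std::vector<std::int16_t>& Arow,
                          const std::vector<std::int16_t>& Bcol) {
    std::int16_t best = INT16_MAX;                                  // AdditiveIdentity
    for (std::size_t k = 0; k < Arow.size(); ++k) {
        if (Arow[k] == INT16_MAX || Bcol[k] == INT16_MAX) continue; // multiply() saturates
        best = std::min<std::int16_t>(best, static_cast<std::int16_t>(Arow[k] + Bcol[k]));
    }
    return best;
}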
true : false; + + cudaSetDevice(device); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device); + std::cout << "Going to use " << prop.name << " " << prop.major << "." << prop.minor << "\n"; + + // CSR matrices on the device + CSR csr_mat, csr_T_mat, result_mat, test_mat; + dCSR dcsr_mat, dcsr_T_mat ;//, d_nsparse_result_mat, d_rmerge_result_mat, d_bhSparse_result_mat; + + dCSR d_result_mat_comp, d_result_mat; + //try load csr file + std::string csr_name = std::string(argv[1]) + typeext() + ".hicsr"; + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csr_mat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << argv[1] << "\"\n"; + COO coo_mat= loadMTX(argv[1]); + // coo_mat.alloc(2,2,4); + // coo_mat.data[0]= MinRing::Init(1); + // coo_mat.data[1]= MinRing::Init(2); + // coo_mat.data[2]= MinRing::Init(3); + // coo_mat.data[3]= MinRing::Init(4); + + + // coo_mat.row_ids[0] = 0; + // coo_mat.col_ids[0] = 0; + + // coo_mat.row_ids[1] = 0; + // coo_mat.col_ids[1] = 1; + + + // coo_mat.row_ids[2] = 1; + // coo_mat.col_ids[2] = 0; + + // coo_mat.row_ids[3] = 1; + // coo_mat.col_ids[3] = 1; + + + + + convert(csr_mat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + return -1; + } + try + { + std::cout << "write csr file for future use\n"; + storeCSR(csr_mat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + + // Convert host csr to device csr + convert(dcsr_mat, csr_mat, padding); + + + + bool transpose = (dcsr_mat.rows != dcsr_mat.cols); + if (transpose) + { + std::cout << "Matrix not square (" << dcsr_mat.rows << "x" << dcsr_mat.cols << ") - Calculate Transpose!\n"; + /*ACSpGEMM::Transpose(dcsr_mat, dcsr_T_mat);*/ + convert(csr_T_mat, dcsr_T_mat, padding); + } + + printf("Input Matrix A: (%zu x %zu) - NNZ: %zu\n", dcsr_mat.rows, dcsr_mat.cols, dcsr_mat.nnz); + if(transpose) + printf("Input Matrix B: (%zu x %zu) - NNZ: %zu\n", dcsr_T_mat.rows, dcsr_T_mat.cols, dcsr_T_mat.nnz); + + + + + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 8; + const int MaxChunksGeneralizedMerge = 512; // MAX: 865 + const int MergePathOptions = 8; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, InputElementsPerThreads, RetainElementsPerThreads, MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); // DefaultTraits(128, 2, 4, 1, 8, 128, 8); + const bool Debug_Mode = true; + bool checkBitStability{true}; + DefaultTraits.preferLoadBalancing = true; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + output_stats.measure_all = false; + + uint32_t warmupiterations = testing ? checkBitStability ? 1 : 0: 20; + uint32_t iterations = testing ? 1 : 20; + + + + // Multiplication + /*if (testing) + std::cout << "Iteration: " << i + 1 << "\n";*/ + MinRing j = MinRing { }; + + std::cout << "Performing SpGEMM, GPU" << std::endl; + ACSpGEMM::Multiply(dcsr_mat, transpose ? 
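For context, convert(csr_mat, coo_mat) above performs the usual COO-to-CSR conversion: count nonzeros per row, prefix-sum the counts into row_offsets, then copy the column ids and values over. A host-side sketch under the assumption that the triplets are already sorted by row; the types are illustrative, not the library's CSR/COO classes:

#include <cstdint>
#include <vector>

struct CooHost { std::vector<std::uint32_t> row, col; std::vector<double> val; std::uint32_t rows; };
struct CsrHost { std::vector<std::uint32_t> row_offsets, col_ids; std::vector<double> data; };

CsrHost cooToCsr(const CooHost& in) {
    CsrHost out;
    out.row_offsets.assign(in.rows + 1, 0);
    for (std::uint32_t r : in.row) ++out.row_offsets[r + 1];   // count nnz per row
    for (std::uint32_t i = 0; i < in.rows; ++i)                // prefix sum -> offsets
        out.row_offsets[i + 1] += out.row_offsets[i];
    out.col_ids = in.col;                                      // row-sorted input keeps CSR order
    out.data = in.val;
    return out;
}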
dcsr_T_mat : dcsr_mat, d_result_mat, DefaultTraits, stats, Debug_Mode,j); + std::cout << "SpGEMM Done\n"; + + CSR out; + std::cout << "Performing SpGEMM, CPU" << std::endl; + + Mult_CPU(csr_mat, csr_mat, out, j); + std::cout << "CPU-SpGEMM Done\n"; + + std::ofstream log_f; + + + if(argc >= 3) + { + log_f.open(argv[2]); + } + + + + CSR coo_mat; + + convert(coo_mat, d_result_mat,0); + COO coo; + cudaDeviceSynchronize(); + + uint64_t err_count = 0; + uint64_t checked = 0; + if (coo_mat.nnz != out.nnz) { + if (argc >= 3) { + log_f << "ERROR:" << "nonzeros GPU: " << coo_mat.nnz << " vs non-zeros cpu:" << out.nnz <= 3) { + for (int i =0; i < coo_mat.nnz; i++) { + + if (coo_mat.data[i].val != out.data[i].val){ + log_f << "ERROR, NNZ Entry#: " << i << " (" << coo_mat.row_offsets[i] << ", " << coo_mat.col_ids[i] << ") gpu: " << coo_mat.data[i].val << " vs CPU: " << out.data[i].val << std::endl; + err_count++; + } else { + checked++;// this can be calulated from errocount, but I'm being paranoid to make sure we don't trivially pass + } + } + + + log_f << "Total errors: " <= 3) + { + log_f << "output NNZ checked" << coo_mat.nnz << std::endl; + std::cout << "NNZ correct / # of checked output: " << checked << "/" << coo_mat.nnz << std::endl; + + log_f.close(); + + } + + std::cout << "Total errors: " << err_count <(d_result_mat_comp, d_result_mat, true)) +// { +// printf("NOT Bit-Identical\n"); +// printCross(); +// exit(-1); +// } +// else +// { +// printf("Bit-Identical\n"); +// printCheckMark(); +// } + + + + + // output_stats.normalize(); + // std::cout << output_stats; + // std::cout << "-----------------------------------------------\n"; + + // if(checkBitStability) + // return 0; + + return 0; +} + + diff --git a/mfiles/betwCentrality.m b/mfiles/betwCentrality.m index 94d2b992..f32c23f5 100644 --- a/mfiles/betwCentrality.m +++ b/mfiles/betwCentrality.m @@ -1,266 +1,266 @@ -function bc = betwCentrality( G, K4approx, batchSize ) - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Function betwCentrality() - Kernel 4, analyze a graph's connectivity. -% -% The fourth computational kernel computes the betweenness centrality for -% an unweighted graph, using only matrix operations. Betweenness centrality -% is a measure of the importance of a vertex with respect to the shortest -% paths between other vertices in the graph that it lies on. This function -% computes an ordered list of centralities, each centrality corresponding -% to a specific vertex in the graph. -% -% The high computational cost of kernel 4: -% An exact implementation would consider all of the vertices as starting -% points in the betweenness centrality metric; this implementation -% can be 'dialed' to use a subset of starting vertices to obtain an -% approximation of the betweenness centrality. -% -% For a detailed description of the SCCA #2 graph analysis algorithm, -% please see SCCA #2 Graph Analysis Written Specification, V2.2. -% -% NOTES: -% -% This code is the vectorized version of the pseudo-code provided in the -% specification. It is designed to process a full level in the search tree -% at a time rather than just a single vertex. All of the operations are -% performed in he same way, only this code is able to perform them in -% parallel using sparse matrices and matrix operations. In addition, -% rather than processing a single vertex at a time, it has a configurable -% batch size parameter. 
While increasing the size of a batch increases the -% space required by the algorithm, it may also increase the performance. -% -% This uses Ulrik Brandes' Algorithm from "A faster algorithm for -% betweenness centrality", where variables are named in the following way: -% -% Ulrik Brandes This Code -% ----------------------------------------------------------------------- -% C_B bc -% P, d (unused) -% S, Q bfs -% s batch -% sigma nsp -% delta bcu -% -% S and Q can be stored using the same variable. This optimization can be -% performed in the original algorithm as well by simply using a vector for -% storage rather than a stack and a queue. Instead of discarding vertices -% from the top of Q, the vector pointer is advanced. The stack S -% corresponds to the vertices of the array in reverse order. -% -% bfs is stored as a matrix rather than a vector. Rather than looking at a -% single vertex at a time, all vertices at a particular depth are examined. -% -% D is not required. It was used previously to determine the -% distance between two vertices. In this implementation, this can be -% computed by looking at bfs. In addition, since all the nodes at a -% particular depth in the search are examined at the same time, all -% previously unseen vertices must be on shortest paths. -% -% P is computed rather than stored by selecting edges that go between -% vertices at neighboring depths. -% -% References: -% -% D.A. Bader and K. Madduri, "Parallel Algorithms for Evaluating Centrality -% Indices in Real-world Networks", Proc. The 35th International Conference -% on Parallel Processing (ICPP), Columbus, OH, August 2006. -% -% Ulrik Brandes, "A faster algorithm for betweenness centrality". Journal -% of Mathematical Sociology, 25(2):163177, 2001. -% -% L.C. Freeman, "A set of measures of centrality based on betweenness". -% Sociometry, 40(1):3541, 1977. -% -% -% INPUT -% -% G. - [struct] graph (from kernel 1). -% adjMatrix - sparse weighted adjacency matrix of the graph. -% K4approx - [int] binary exponent of the number of times that the -% algorithm is to loop, between 1 and SCALE. This -% simplification reduces its computational time from O(MN) -% to O(M*2^K4approx), which is important when testing large -% graphs. It determines the amount of work performed by -% kernel 4. When 'K4approx' equals 'SCALE', this -% implementation is exact. Otherwise, distinct vertices -% are selected randomly (user). -% batchSize - [int] the number of vertices to process at once. The space -% required by the algorithm increases linearly in this -% parameter. While there is no theoretical decrease in -% runtime by increasing this parameter, in actual -% implementations performance may increase due to batch -% processing of the operations. -% -% OUTPUT -% -% bc - [1D array, float] Betweenness centrality is a measure of -% the importance of a vertex with respect to the shortest -% paths between other vertices in the graph it lies on. bc is -% a list of the centralities that were computed (ordered by -% vertex number). -% -% -% REVISION -% 12-Oct-07 1.0 Release MIT Lincoln Laboratory. 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% Allocate the data structures and initialize variables: -% Name Dimension Entries -% A : B^(N x N) M -% bfs : B^(batchSize x N x N) batchSize x N -% nsp : Z+^(batchSize x N) batchSize x N -% bcu : R^(batchSize x N) batchSize x N -% nspInv : R^(batchSize x N) batchSize x N -% w : R^(batchSize x N) batchSize x N -% fringe : Z+^(batchSize x N) < batchSize x N -% bc : R^(N) N -% batch : Z+^(batchSize) batchSize - -% Variable Description: -% A : The adjacency matrix. An entry at (x,y) indicates an edge -% coming from vertex x going to vertex y. Used only in its -% boolean form in this computation (unweighted). -% N : The number of vertices in the graph. -% batchSize : The number of vertices to process simultaneously. -% Increasing this number increases the amount of storage -% required by the algorithm, but may also increase the -% performance due to batch processing of the data. -% batch : The vertices in the current batch to be processed. -% bfs : The breadth-first search tree discovered. An entry at -% (x,y,z) indicates that for the root vertex batch(x), vertex -% y was discovered at depth z in the breadth-first search. -% nsp : The number of shortest paths. An entry (x,y)=m indicates -% that for root vertex batch(x), vertex y has m shortest -% paths to it. -% bcu : The centrality updates. An entry (x,y)=m indicates that -% root vertex batch(x) contributes m to the betweenness -% centrality for vertex y. -% nspInv : The inverse of the number of shortest paths. An entry -% (x,y)=m indicates that for root vertex batch(x), vertex y -% has 1/m shortest paths to it. -% w : The child weights during the centrality update. An entry -% (x,y)=m indicates that for root vertex batch(x), child -% vertex y applies a weight of m to all its parent vertices -% during the centrality update. -% fringe : The current open queue of the breadth-first search. When -% the depth is d in the breadth-first search, an entry -% (x,y)=m indicates that for root vertex batch(x), vertex y -% is at depth d and has m paths going to it. -% bc : The centrality scores. An entry (y)=m indicates that -% vertex y has a betweenness centrality score of m. - -% Convert the adjacency matrix to an unweighted graph, filter the edges -A = logical(G)'; - -% Get the number of vertices of the graph. 
-N = length(A); - -% Initialize the centrality -bc = zeros(1,N); - -% Fix any issues with the approximation and get the number of passes -if (2^K4approx > N) % Cannot perform more than N approximations - K4approx = floor(log2(N)); -end -nPasses = 2^K4approx; - -% Get the total number of batches -numBatches = ceil(nPasses/batchSize); - -for p = 1:numBatches - % Zero out the BFS - bfs = []; - - % Get the vertices in the current batch - batch = ((p-1).*batchSize + 1):min(p.*batchSize,N); - - % Get the size of the current batch - curSize = length(batch); - - % Set the number of paths to all root vertices to one - nsp = accumarray([(1:curSize)',batch'],1,[curSize,N]); - - % Set the counter for the depth in the BFS - depth = 0; - - % Set the initial fringe to be the neighbors of the root vertices - fringe = double(A(batch,:)); - - % While there are vertices in the fringe to iterate over - while nnz(fringe) > 0 - % Increment the depth - depth = depth + 1; - % Add in the shortest path counts from the fringe - nsp = nsp + fringe; - % Add in the vertices discovered from the fringe to the BFS - bfs(depth).G = logical(fringe); - % Compute the the next fringe - fringe = (fringe * A) .* not(nsp); - end - - % Free up memory - clear('fringe'); - - % Pre-compute 1/nsp - [rows cols vals] = find(nsp); - if(curSize==1) rows = rows'; cols = cols'; end - nspInv = accumarray([rows,cols],1./vals,[curSize,N]); - - % Free up memory - clear('rows','cols','vals'); - - % Pre-compute (1+bcUpdate) - bcu = ones(curSize,N); - - % Compute the bc update for all vertices except the sources - for depth = depth:-1:2 - % Compute the weights to be applied based on the child values - w = (bfs(depth).G .* nspInv) .* bcu; - % Apply the child value weights and sum them up over the parents - % Then apply the weights based on parent values - bcu = bcu + ((A * w')' .* bfs(depth-1).G) .* nsp; - end - - % Update the bc with the bc update - bc = bc + sum(bcu,1); - - % Free up memory - clear('w','nspInv','nsp','bcu','bfs'); -end - -% Subtract off the additional values added in by precomputation -bc = bc - nPasses; - - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Copyright 2007, Massachusetts Institute of Technology -% All rights reserved. -% -% Redistribution and use in source and binary forms, with or without -% modification, are permitted provided that the following conditions are -% met: -% * Redistributions of source code must retain the above copyright -% notice, this list of conditions and the following disclaimer. -% * Redistributions in binary form must reproduce the above copyright -% notice, this list of conditions and the following disclaimer in the -% documentation and/or other materials provided with the distribution. -% * Neither the name of the Massachusetts Institute of Technology nor -% the names of its contributors may be used to endorse or promote -% products derived from this software without specific prior written -% permission. -% -% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -% PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function bc = betwCentrality( G, K4approx, batchSize ) + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Function betwCentrality() - Kernel 4, analyze a graph's connectivity. +% +% The fourth computational kernel computes the betweenness centrality for +% an unweighted graph, using only matrix operations. Betweenness centrality +% is a measure of the importance of a vertex with respect to the shortest +% paths between other vertices in the graph that it lies on. This function +% computes an ordered list of centralities, each centrality corresponding +% to a specific vertex in the graph. +% +% The high computational cost of kernel 4: +% An exact implementation would consider all of the vertices as starting +% points in the betweenness centrality metric; this implementation +% can be 'dialed' to use a subset of starting vertices to obtain an +% approximation of the betweenness centrality. +% +% For a detailed description of the SCCA #2 graph analysis algorithm, +% please see SCCA #2 Graph Analysis Written Specification, V2.2. +% +% NOTES: +% +% This code is the vectorized version of the pseudo-code provided in the +% specification. It is designed to process a full level in the search tree +% at a time rather than just a single vertex. All of the operations are +% performed in he same way, only this code is able to perform them in +% parallel using sparse matrices and matrix operations. In addition, +% rather than processing a single vertex at a time, it has a configurable +% batch size parameter. While increasing the size of a batch increases the +% space required by the algorithm, it may also increase the performance. +% +% This uses Ulrik Brandes' Algorithm from "A faster algorithm for +% betweenness centrality", where variables are named in the following way: +% +% Ulrik Brandes This Code +% ----------------------------------------------------------------------- +% C_B bc +% P, d (unused) +% S, Q bfs +% s batch +% sigma nsp +% delta bcu +% +% S and Q can be stored using the same variable. This optimization can be +% performed in the original algorithm as well by simply using a vector for +% storage rather than a stack and a queue. Instead of discarding vertices +% from the top of Q, the vector pointer is advanced. The stack S +% corresponds to the vertices of the array in reverse order. +% +% bfs is stored as a matrix rather than a vector. Rather than looking at a +% single vertex at a time, all vertices at a particular depth are examined. +% +% D is not required. It was used previously to determine the +% distance between two vertices. In this implementation, this can be +% computed by looking at bfs. In addition, since all the nodes at a +% particular depth in the search are examined at the same time, all +% previously unseen vertices must be on shortest paths. 
+% +% P is computed rather than stored by selecting edges that go between +% vertices at neighboring depths. +% +% References: +% +% D.A. Bader and K. Madduri, "Parallel Algorithms for Evaluating Centrality +% Indices in Real-world Networks", Proc. The 35th International Conference +% on Parallel Processing (ICPP), Columbus, OH, August 2006. +% +% Ulrik Brandes, "A faster algorithm for betweenness centrality". Journal +% of Mathematical Sociology, 25(2):163177, 2001. +% +% L.C. Freeman, "A set of measures of centrality based on betweenness". +% Sociometry, 40(1):3541, 1977. +% +% +% INPUT +% +% G. - [struct] graph (from kernel 1). +% adjMatrix - sparse weighted adjacency matrix of the graph. +% K4approx - [int] binary exponent of the number of times that the +% algorithm is to loop, between 1 and SCALE. This +% simplification reduces its computational time from O(MN) +% to O(M*2^K4approx), which is important when testing large +% graphs. It determines the amount of work performed by +% kernel 4. When 'K4approx' equals 'SCALE', this +% implementation is exact. Otherwise, distinct vertices +% are selected randomly (user). +% batchSize - [int] the number of vertices to process at once. The space +% required by the algorithm increases linearly in this +% parameter. While there is no theoretical decrease in +% runtime by increasing this parameter, in actual +% implementations performance may increase due to batch +% processing of the operations. +% +% OUTPUT +% +% bc - [1D array, float] Betweenness centrality is a measure of +% the importance of a vertex with respect to the shortest +% paths between other vertices in the graph it lies on. bc is +% a list of the centralities that were computed (ordered by +% vertex number). +% +% +% REVISION +% 12-Oct-07 1.0 Release MIT Lincoln Laboratory. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% Allocate the data structures and initialize variables: +% Name Dimension Entries +% A : B^(N x N) M +% bfs : B^(batchSize x N x N) batchSize x N +% nsp : Z+^(batchSize x N) batchSize x N +% bcu : R^(batchSize x N) batchSize x N +% nspInv : R^(batchSize x N) batchSize x N +% w : R^(batchSize x N) batchSize x N +% fringe : Z+^(batchSize x N) < batchSize x N +% bc : R^(N) N +% batch : Z+^(batchSize) batchSize + +% Variable Description: +% A : The adjacency matrix. An entry at (x,y) indicates an edge +% coming from vertex x going to vertex y. Used only in its +% boolean form in this computation (unweighted). +% N : The number of vertices in the graph. +% batchSize : The number of vertices to process simultaneously. +% Increasing this number increases the amount of storage +% required by the algorithm, but may also increase the +% performance due to batch processing of the data. +% batch : The vertices in the current batch to be processed. +% bfs : The breadth-first search tree discovered. An entry at +% (x,y,z) indicates that for the root vertex batch(x), vertex +% y was discovered at depth z in the breadth-first search. +% nsp : The number of shortest paths. An entry (x,y)=m indicates +% that for root vertex batch(x), vertex y has m shortest +% paths to it. +% bcu : The centrality updates. An entry (x,y)=m indicates that +% root vertex batch(x) contributes m to the betweenness +% centrality for vertex y. +% nspInv : The inverse of the number of shortest paths. An entry +% (x,y)=m indicates that for root vertex batch(x), vertex y +% has 1/m shortest paths to it. +% w : The child weights during the centrality update. 
An entry +% (x,y)=m indicates that for root vertex batch(x), child +% vertex y applies a weight of m to all its parent vertices +% during the centrality update. +% fringe : The current open queue of the breadth-first search. When +% the depth is d in the breadth-first search, an entry +% (x,y)=m indicates that for root vertex batch(x), vertex y +% is at depth d and has m paths going to it. +% bc : The centrality scores. An entry (y)=m indicates that +% vertex y has a betweenness centrality score of m. + +% Convert the adjacency matrix to an unweighted graph, filter the edges +A = logical(G)'; + +% Get the number of vertices of the graph. +N = length(A); + +% Initialize the centrality +bc = zeros(1,N); + +% Fix any issues with the approximation and get the number of passes +if (2^K4approx > N) % Cannot perform more than N approximations + K4approx = floor(log2(N)); +end +nPasses = 2^K4approx; + +% Get the total number of batches +numBatches = ceil(nPasses/batchSize); + +for p = 1:numBatches + % Zero out the BFS + bfs = []; + + % Get the vertices in the current batch + batch = ((p-1).*batchSize + 1):min(p.*batchSize,N); + + % Get the size of the current batch + curSize = length(batch); + + % Set the number of paths to all root vertices to one + nsp = accumarray([(1:curSize)',batch'],1,[curSize,N]); + + % Set the counter for the depth in the BFS + depth = 0; + + % Set the initial fringe to be the neighbors of the root vertices + fringe = double(A(batch,:)); + + % While there are vertices in the fringe to iterate over + while nnz(fringe) > 0 + % Increment the depth + depth = depth + 1; + % Add in the shortest path counts from the fringe + nsp = nsp + fringe; + % Add in the vertices discovered from the fringe to the BFS + bfs(depth).G = logical(fringe); + % Compute the the next fringe + fringe = (fringe * A) .* not(nsp); + end + + % Free up memory + clear('fringe'); + + % Pre-compute 1/nsp + [rows cols vals] = find(nsp); + if(curSize==1) rows = rows'; cols = cols'; end + nspInv = accumarray([rows,cols],1./vals,[curSize,N]); + + % Free up memory + clear('rows','cols','vals'); + + % Pre-compute (1+bcUpdate) + bcu = ones(curSize,N); + + % Compute the bc update for all vertices except the sources + for depth = depth:-1:2 + % Compute the weights to be applied based on the child values + w = (bfs(depth).G .* nspInv) .* bcu; + % Apply the child value weights and sum them up over the parents + % Then apply the weights based on parent values + bcu = bcu + ((A * w')' .* bfs(depth-1).G) .* nsp; + end + + % Update the bc with the bc update + bc = bc + sum(bcu,1); + + % Free up memory + clear('w','nspInv','nsp','bcu','bfs'); +end + +% Subtract off the additional values added in by precomputation +bc = bc - nPasses; + + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2007, Massachusetts Institute of Technology +% All rights reserved. +% +% Redistribution and use in source and binary forms, with or without +% modification, are permitted provided that the following conditions are +% met: +% * Redistributions of source code must retain the above copyright +% notice, this list of conditions and the following disclaimer. +% * Redistributions in binary form must reproduce the above copyright +% notice, this list of conditions and the following disclaimer in the +% documentation and/or other materials provided with the distribution. 
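For reference, the update loop above is Brandes' dependency accumulation with bcu holding (1 + delta), nsp holding sigma and nspInv holding 1/sigma, matching the variable table in the header comment:

\delta_s(v) = \sum_{w:\, v \in P_s(w)} \frac{\sigma_{sv}}{\sigma_{sw}} \bigl(1 + \delta_s(w)\bigr), \qquad bc(v) = \sum_{s \neq v} \delta_s(v).

Because bcu is initialised to ones, each of the 2^K4approx source vertices contributes an extra 1 to every centrality, which is exactly what the final bc = bc - nPasses line removes.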
+% * Neither the name of the Massachusetts Institute of Technology nor +% the names of its contributors may be used to endorse or promote +% products derived from this software without specific prior written +% permission. +% +% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/mfiles/compress.pl b/mfiles/compress.pl index 3d2f3bd1..8d24af1b 100755 --- a/mfiles/compress.pl +++ b/mfiles/compress.pl @@ -1,28 +1,28 @@ -#!/bin/perl -# -# Script to compress all inputs -# -$matrixname="betwinput_scale"; -$transname="betwinput_transposed_scale"; -$outname="bc_scale"; -for($scale=27; $scale<=27; $scale++) -{ - $strtar = "tar -cvf ${outname}$scale.tar ${matrixname}$scale ${transname}$scale\n"; - print $strtar; - system($strtar); - - $strzip = "pbzip2 -p8 -k -r ${outname}$scale.tar\n"; - print $strzip; - system($strzip); - - $strsizetar = "ls -alh ${outname}$scale.tar\n"; - $strsizezip = "ls -alh ${outname}$scale.tar.bz2\n"; - print $strsizetar; - print $strsizezip; - system($strsizetar); - system($strsizezip); - - $strdel = "rm ${matrixname}$scale ${transname}$scale ${outname}$scale.tar\n"; - print $strdel; - system($strdel); -} +#!/bin/perl +# +# Script to compress all inputs +# +$matrixname="betwinput_scale"; +$transname="betwinput_transposed_scale"; +$outname="bc_scale"; +for($scale=27; $scale<=27; $scale++) +{ + $strtar = "tar -cvf ${outname}$scale.tar ${matrixname}$scale ${transname}$scale\n"; + print $strtar; + system($strtar); + + $strzip = "pbzip2 -p8 -k -r ${outname}$scale.tar\n"; + print $strzip; + system($strzip); + + $strsizetar = "ls -alh ${outname}$scale.tar\n"; + $strsizezip = "ls -alh ${outname}$scale.tar.bz2\n"; + print $strsizetar; + print $strsizezip; + system($strsizetar); + system($strsizezip); + + $strdel = "rm ${matrixname}$scale ${transname}$scale ${outname}$scale.tar\n"; + print $strdel; + system($strdel); +} diff --git a/ms_inttypes/inttypes.h b/ms_inttypes/inttypes.h index 25542771..4b3828a2 100644 --- a/ms_inttypes/inttypes.h +++ b/ms_inttypes/inttypes.h @@ -1,305 +1,305 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include "stdint.h" - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "I32d" -#define PRIi32 "I32i" -#define PRIdLEAST32 "I32d" -#define PRIiLEAST32 "I32i" -#define PRIdFAST32 "I32d" -#define PRIiFAST32 "I32i" - -#define PRId64 "I64d" -#define PRIi64 "I64i" -#define PRIdLEAST64 "I64d" -#define PRIiLEAST64 "I64i" -#define PRIdFAST64 "I64d" -#define PRIiFAST64 "I64i" - -#define PRIdMAX "I64d" -#define PRIiMAX "I64i" - -#define PRIdPTR "Id" -#define PRIiPTR "Ii" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "I32o" -#define PRIu32 "I32u" -#define PRIx32 "I32x" -#define PRIX32 "I32X" -#define PRIoLEAST32 "I32o" -#define PRIuLEAST32 "I32u" -#define PRIxLEAST32 "I32x" -#define PRIXLEAST32 "I32X" -#define PRIoFAST32 "I32o" -#define PRIuFAST32 "I32u" -#define PRIxFAST32 "I32x" -#define PRIXFAST32 "I32X" - -#define PRIo64 "I64o" -#define PRIu64 "I64u" -#define PRIx64 "I64x" -#define PRIX64 "I64X" -#define PRIoLEAST64 "I64o" -#define PRIuLEAST64 "I64u" -#define PRIxLEAST64 "I64x" -#define PRIXLEAST64 "I64X" -#define PRIoFAST64 "I64o" -#define PRIuFAST64 "I64u" -#define PRIxFAST64 "I64x" -#define PRIXFAST64 "I64X" - -#define PRIoMAX "I64o" -#define PRIuMAX "I64u" -#define PRIxMAX "I64x" -#define PRIXMAX "I64X" - 
-#define PRIoPTR "Io" -#define PRIuPTR "Iu" -#define PRIxPTR "Ix" -#define PRIXPTR "IX" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" -#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -#endif // __STDC_FORMAT_MACROS ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else // STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] +// ISO C9x compliant inttypes.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) 
WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include "stdint.h" + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "I32d" +#define PRIi32 "I32i" +#define PRIdLEAST32 "I32d" +#define PRIiLEAST32 "I32i" +#define PRIdFAST32 "I32d" +#define PRIiFAST32 "I32i" + +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIdLEAST64 "I64d" +#define PRIiLEAST64 "I64i" +#define PRIdFAST64 "I64d" +#define PRIiFAST64 "I64i" + +#define PRIdMAX "I64d" +#define PRIiMAX "I64i" + +#define PRIdPTR "Id" +#define PRIiPTR "Ii" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "I32o" +#define PRIu32 "I32u" +#define PRIx32 "I32x" +#define PRIX32 "I32X" +#define PRIoLEAST32 "I32o" +#define PRIuLEAST32 "I32u" +#define 
PRIxLEAST32 "I32x" +#define PRIXLEAST32 "I32X" +#define PRIoFAST32 "I32o" +#define PRIuFAST32 "I32u" +#define PRIxFAST32 "I32x" +#define PRIXFAST32 "I32X" + +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#define PRIoLEAST64 "I64o" +#define PRIuLEAST64 "I64u" +#define PRIxLEAST64 "I64x" +#define PRIXLEAST64 "I64X" +#define PRIoFAST64 "I64o" +#define PRIuFAST64 "I64u" +#define PRIxFAST64 "I64x" +#define PRIXFAST64 "I64X" + +#define PRIoMAX "I64o" +#define PRIuMAX "I64u" +#define PRIxMAX "I64x" +#define PRIXMAX "I64X" + +#define PRIoPTR "Io" +#define PRIuPTR "Iu" +#define PRIxPTR "Ix" +#define PRIXPTR "IX" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" +#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +#endif // __STDC_FORMAT_MACROS ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = 
numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/ms_inttypes/stdint.h b/ms_inttypes/stdint.h index 59d06730..d02608a5 100644 --- a/ms_inttypes/stdint.h +++ b/ms_inttypes/stdint.h @@ -1,247 +1,247 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2008 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. 
-#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer 
types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -#define INTMAX_C INT64_C -#define UINTMAX_C UINT64_C - -#endif // __STDC_CONSTANT_MACROS ] - - -#endif // _MSC_STDINT_H_ ] +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. +#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest 
minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ]
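
The ms_inttypes/inttypes.h and ms_inttypes/stdint.h hunks above appear to be whitespace/line-ending-only rewrites of the vendored C99-compatibility headers for pre-C99 Visual C++; together they supply the fixed-width typedefs, the PRI/SCN format-specifier macros, the integer-constant macros, and imaxdiv that older MSVC runtimes lacked. As a rough, hypothetical sketch (not part of the patch: the file name demo.c and the include paths are assumptions, and it only builds under Visual C++ because both headers #error out when _MSC_VER is not defined), client code would use these headers roughly like this:

/* demo.c -- illustrative only, assumes the repository layout above. */
#include <stdio.h>
#include "ms_inttypes/stdint.h"   /* fixed-width typedefs, INTn_C, limits   */
#include "ms_inttypes/inttypes.h" /* PRI/SCN macros, imaxdiv, strtoimax     */

int main(void)
{
    int64_t  big   = INT64_C(9000000000);  /* integer-constant macro -> 9000000000i64 */
    uint32_t small = UINT32_MAX;

    /* PRId64 / PRIu32 expand to MSVC's "I64d" / "I32u" length modifiers. */
    printf("big = %" PRId64 ", small = %" PRIu32 "\n", big, small);

    /* imaxdiv mirrors div() semantics for the greatest-width type; with the
       truncating division used here this should print quot = -3, rem = -1. */
    imaxdiv_t qr = imaxdiv((intmax_t)-7, (intmax_t)2);
    printf("quot = %" PRIdMAX ", rem = %" PRIdMAX "\n", qr.quot, qr.rem);

    return 0;
}

On a C99-conforming toolchain the same program would instead include the system <stdint.h> and <inttypes.h>; the vendored copies exist purely as a drop-in for Visual C++ versions that ship without them.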