diff --git a/.gitignore b/.gitignore index 3e6b5557..a6d9c1d3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,7 @@ test/temp_fullydistvec *.o SpGEMM3D mcl3d +_build +_install +.clangd +.cache diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..fff80899 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,76 @@ +{ + "files.associations": { + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "__nullptr": "cpp", + "filesystem": "cpp", + "__locale": "cpp", + "locale": "cpp", + "__config": "cpp" + } +} \ No newline at end of file diff --git a/3DSpGEMM/matlab/mmwrite.m b/3DSpGEMM/matlab/mmwrite.m index babeb5c9..53a71b7f 100644 --- a/3DSpGEMM/matlab/mmwrite.m +++ b/3DSpGEMM/matlab/mmwrite.m @@ -1,274 +1,274 @@ -function [ err ] = mmwrite(filename,A,comment,field,precision) -% -% Function: mmwrite(filename,A,comment,field,precision) -% -% Writes the sparse or dense matrix A to a Matrix Market (MM) -% formatted file. -% -% Required arguments: -% -% filename - destination file -% -% A - sparse or full matrix -% -% Optional arguments: -% -% comment - matrix of comments to prepend to -% the MM file. To build a comment matrix, -% use str2mat. For example: -% -% comment = str2mat(' Comment 1' ,... -% ' Comment 2',... -% ' and so on.',... -% ' to attach a date:',... -% [' ',date]); -% If ommitted, a single line date stamp comment -% will be included. -% -% field - 'real' -% 'complex' -% 'integer' -% 'pattern' -% If ommitted, data will determine type. -% -% precision - number of digits to display for real -% or complex values -% If ommitted, full working precision is used. 
-% - -if ( nargin == 5) - precision = 16; -elseif ( nargin == 4) - precision = 16; -elseif ( nargin == 3) - mattype = 'real'; % placeholder, will check after FIND-ing A - precision = 16; -elseif ( nargin == 2) - comment = ''; - % Check whether there is an imaginary part: - mattype = 'real'; % placeholder, will check after FIND-ing A - precision = 16; -end - -mmfile = fopen([filename],'w'); -if ( mmfile == -1 ) - error('Cannot open file for output'); -end; - - -[M,N] = size(A); - -%%%%%%%%%%%%% This part for sparse matrices %%%%%%%%%%%%%%%% -if ( issparse(A) ) - - [I,J,V] = find(A); - if ( sum(abs(imag(nonzeros(V)))) > 0 ) - Vreal = 0; - else - Vreal = 1; - end - - if ( ~ strcmp(mattype,'pattern') & Vreal ) - mattype = 'real'; - elseif ( ~ strcmp(mattype,'pattern') ) - mattype = 'complex'; - end -% -% Determine symmetry: -% - if ( M ~= N ) - symm = 'general'; - issymm = 0; - NZ = length(V); - else - issymm = 1; - NZ = length(V); - for i=1:NZ - if ( A(J(i),I(i)) ~= V(i) ) - issymm = 0; - break; - end - end - if ( issymm ) - symm = 'symmetric'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - else - isskew = 1; - for i=1:NZ - if ( A(J(i),I(i)) ~= - V(i) ) - isskew = 0; - break; - end - end - if ( isskew ) - symm = 'skew-symmetric'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - elseif ( strcmp(mattype,'complex') ) - isherm = 1; - for i=1:NZ - if ( A(J(i),I(i)) ~= conj(V(i)) ) - isherm = 0; - break; - end - end - if ( isherm ) - symm = 'hermitian'; - ATEMP = tril(A); - [I,J,V] = find(ATEMP); - NZ = nnz(ATEMP); - else - symm = 'general'; - NZ = nnz(A); - end - else - symm = 'general'; - NZ = nnz(A); - end - end - end - -% Sparse coordinate format: - - rep = 'coordinate'; - - - fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); - [MC,NC] = size(comment); - if ( MC == 0 ) - fprintf(mmfile,'%% Generated %s\n',[date]); - else - for i=1:MC, - fprintf(mmfile,'%%%s\n',comment(i,:)); - end - end - fprintf(mmfile,'%d %d %d\n',M,N,NZ); - cplxformat = sprintf('%%d %%d %% .%dg %% .%dg\n',precision,precision); - realformat = sprintf('%%d %%d %% .%dg\n',precision); - if ( strcmp(mattype,'real') ) - for i=1:NZ - fprintf(mmfile,realformat,I(i),J(i),V(i)); - end; - elseif ( strcmp(mattype,'complex') ) - for i=1:NZ - fprintf(mmfile,cplxformat,I(i),J(i),real(V(i)),imag(V(i))); - end; - elseif ( strcmp(mattype,'pattern') ) - for i=1:NZ - fprintf(mmfile,'%d %d\n',I(i),J(i)); - end; - else - err = -1; - disp('Unsupported mattype:') - mattype - end; - -%%%%%%%%%%%%% This part for dense matrices %%%%%%%%%%%%%%%% -else - if ( sum(abs(imag(nonzeros(A)))) > 0 ) - Areal = 0; - else - Areal = 1; - end - if ( ~strcmp(mattype,'pattern') & Areal ) - mattype = 'real'; - elseif ( ~strcmp(mattype,'pattern') ) - mattype = 'complex'; - end -% -% Determine symmetry: -% - if ( M ~= N ) - issymm = 0; - symm = 'general'; - else - issymm = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= A(j,i) ) - issymm = 0; - break; - end - end - if ( ~ issymm ) break; end - - end - if ( issymm ) - symm = 'symmetric'; - else - isskew = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= - A(j,i) ) - isskew = 0; - break; - end - end - if ( ~ isskew ) break; end - end - if ( isskew ) - symm = 'skew-symmetric'; - elseif ( strcmp(mattype,'complex') ) - isherm = 1; - for j=1:N - for i=j+1:N - if (A(i,j) ~= conj(A(j,i)) ) - isherm = 0; - break; - end - end - if ( ~ isherm ) break; end - end - if ( isherm ) - symm = 'hermitian'; - else - symm = 'general'; - end - else - symm = 'general'; - end - end - end - -% 
Dense array format: - - rep = 'array'; - [MC,NC] = size(comment); - fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); - for i=1:MC, - fprintf(mmfile,'%%%s\n',comment(i,:)); - end; - fprintf(mmfile,'%d %d\n',M,N); - cplxformat = sprintf('%% .%dg %% .%dg\n', precision,precision); - realformat = sprintf('%% .%dg\n', precision); - if ( ~ strcmp(symm,'general') ) - rowloop = 'j'; - else - rowloop = '1'; - end - if ( strcmp(mattype,'real') ) - for j=1:N - for i=eval(rowloop):M - fprintf(mmfile,realformat,A(i,j)); - end - end - elseif ( strcmp(mattype,'complex') ) - for j=1:N - for i=eval(rowloop):M - fprintf(mmfile,cplxformat,real(A(i,j)),imag(A(i,j))); - end - end - elseif ( strcmp(mattype,'pattern') ) - err = -2 - disp('Pattern type inconsistant with dense matrix') - else - err = -2 - disp('Unknown matrix type:') - mattype - end -end - -fclose(mmfile); +function [ err ] = mmwrite(filename,A,comment,field,precision) +% +% Function: mmwrite(filename,A,comment,field,precision) +% +% Writes the sparse or dense matrix A to a Matrix Market (MM) +% formatted file. +% +% Required arguments: +% +% filename - destination file +% +% A - sparse or full matrix +% +% Optional arguments: +% +% comment - matrix of comments to prepend to +% the MM file. To build a comment matrix, +% use str2mat. For example: +% +% comment = str2mat(' Comment 1' ,... +% ' Comment 2',... +% ' and so on.',... +% ' to attach a date:',... +% [' ',date]); +% If ommitted, a single line date stamp comment +% will be included. +% +% field - 'real' +% 'complex' +% 'integer' +% 'pattern' +% If ommitted, data will determine type. +% +% precision - number of digits to display for real +% or complex values +% If ommitted, full working precision is used. +% + +if ( nargin == 5) + precision = 16; +elseif ( nargin == 4) + precision = 16; +elseif ( nargin == 3) + mattype = 'real'; % placeholder, will check after FIND-ing A + precision = 16; +elseif ( nargin == 2) + comment = ''; + % Check whether there is an imaginary part: + mattype = 'real'; % placeholder, will check after FIND-ing A + precision = 16; +end + +mmfile = fopen([filename],'w'); +if ( mmfile == -1 ) + error('Cannot open file for output'); +end; + + +[M,N] = size(A); + +%%%%%%%%%%%%% This part for sparse matrices %%%%%%%%%%%%%%%% +if ( issparse(A) ) + + [I,J,V] = find(A); + if ( sum(abs(imag(nonzeros(V)))) > 0 ) + Vreal = 0; + else + Vreal = 1; + end + + if ( ~ strcmp(mattype,'pattern') & Vreal ) + mattype = 'real'; + elseif ( ~ strcmp(mattype,'pattern') ) + mattype = 'complex'; + end +% +% Determine symmetry: +% + if ( M ~= N ) + symm = 'general'; + issymm = 0; + NZ = length(V); + else + issymm = 1; + NZ = length(V); + for i=1:NZ + if ( A(J(i),I(i)) ~= V(i) ) + issymm = 0; + break; + end + end + if ( issymm ) + symm = 'symmetric'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + else + isskew = 1; + for i=1:NZ + if ( A(J(i),I(i)) ~= - V(i) ) + isskew = 0; + break; + end + end + if ( isskew ) + symm = 'skew-symmetric'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + elseif ( strcmp(mattype,'complex') ) + isherm = 1; + for i=1:NZ + if ( A(J(i),I(i)) ~= conj(V(i)) ) + isherm = 0; + break; + end + end + if ( isherm ) + symm = 'hermitian'; + ATEMP = tril(A); + [I,J,V] = find(ATEMP); + NZ = nnz(ATEMP); + else + symm = 'general'; + NZ = nnz(A); + end + else + symm = 'general'; + NZ = nnz(A); + end + end + end + +% Sparse coordinate format: + + rep = 'coordinate'; + + + fprintf(mmfile,'%%%%MatrixMarket matrix %s %s 
%s\n',rep,mattype,symm); + [MC,NC] = size(comment); + if ( MC == 0 ) + fprintf(mmfile,'%% Generated %s\n',[date]); + else + for i=1:MC, + fprintf(mmfile,'%%%s\n',comment(i,:)); + end + end + fprintf(mmfile,'%d %d %d\n',M,N,NZ); + cplxformat = sprintf('%%d %%d %% .%dg %% .%dg\n',precision,precision); + realformat = sprintf('%%d %%d %% .%dg\n',precision); + if ( strcmp(mattype,'real') ) + for i=1:NZ + fprintf(mmfile,realformat,I(i),J(i),V(i)); + end; + elseif ( strcmp(mattype,'complex') ) + for i=1:NZ + fprintf(mmfile,cplxformat,I(i),J(i),real(V(i)),imag(V(i))); + end; + elseif ( strcmp(mattype,'pattern') ) + for i=1:NZ + fprintf(mmfile,'%d %d\n',I(i),J(i)); + end; + else + err = -1; + disp('Unsupported mattype:') + mattype + end; + +%%%%%%%%%%%%% This part for dense matrices %%%%%%%%%%%%%%%% +else + if ( sum(abs(imag(nonzeros(A)))) > 0 ) + Areal = 0; + else + Areal = 1; + end + if ( ~strcmp(mattype,'pattern') & Areal ) + mattype = 'real'; + elseif ( ~strcmp(mattype,'pattern') ) + mattype = 'complex'; + end +% +% Determine symmetry: +% + if ( M ~= N ) + issymm = 0; + symm = 'general'; + else + issymm = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= A(j,i) ) + issymm = 0; + break; + end + end + if ( ~ issymm ) break; end + + end + if ( issymm ) + symm = 'symmetric'; + else + isskew = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= - A(j,i) ) + isskew = 0; + break; + end + end + if ( ~ isskew ) break; end + end + if ( isskew ) + symm = 'skew-symmetric'; + elseif ( strcmp(mattype,'complex') ) + isherm = 1; + for j=1:N + for i=j+1:N + if (A(i,j) ~= conj(A(j,i)) ) + isherm = 0; + break; + end + end + if ( ~ isherm ) break; end + end + if ( isherm ) + symm = 'hermitian'; + else + symm = 'general'; + end + else + symm = 'general'; + end + end + end + +% Dense array format: + + rep = 'array'; + [MC,NC] = size(comment); + fprintf(mmfile,'%%%%MatrixMarket matrix %s %s %s\n',rep,mattype,symm); + for i=1:MC, + fprintf(mmfile,'%%%s\n',comment(i,:)); + end; + fprintf(mmfile,'%d %d\n',M,N); + cplxformat = sprintf('%% .%dg %% .%dg\n', precision,precision); + realformat = sprintf('%% .%dg\n', precision); + if ( ~ strcmp(symm,'general') ) + rowloop = 'j'; + else + rowloop = '1'; + end + if ( strcmp(mattype,'real') ) + for j=1:N + for i=eval(rowloop):M + fprintf(mmfile,realformat,A(i,j)); + end + end + elseif ( strcmp(mattype,'complex') ) + for j=1:N + for i=eval(rowloop):M + fprintf(mmfile,cplxformat,real(A(i,j)),imag(A(i,j))); + end + end + elseif ( strcmp(mattype,'pattern') ) + err = -2 + disp('Pattern type inconsistant with dense matrix') + else + err = -2 + disp('Unknown matrix type:') + mattype + end +end + +fclose(mmfile); diff --git a/Applications/CMakeLists.txt b/Applications/CMakeLists.txt index a5fb38e9..1d978f5f 100644 --- a/Applications/CMakeLists.txt +++ b/Applications/CMakeLists.txt @@ -1,11 +1,10 @@ # Top level directory has the include files - ADD_EXECUTABLE( tdbfs TopDownBFS.cpp ) ADD_EXECUTABLE( dobfs DirOptBFS.cpp ) ADD_EXECUTABLE( fbfs FilteredBFS.cpp ) ADD_EXECUTABLE( fmis FilteredMIS.cpp ) -ADD_EXECUTABLE( mcl MCL.cpp ) +#ADD_EXECUTABLE( mcl MCL.cpp ) ADD_EXECUTABLE( betwcent BetwCent.cpp ) ADD_EXECUTABLE( lacc CC.cpp) @@ -13,7 +12,7 @@ TARGET_LINK_LIBRARIES( tdbfs CombBLAS) TARGET_LINK_LIBRARIES( dobfs CombBLAS) TARGET_LINK_LIBRARIES( fbfs CombBLAS) TARGET_LINK_LIBRARIES( fmis CombBLAS) -TARGET_LINK_LIBRARIES( mcl CombBLAS) +#TARGET_LINK_LIBRARIES( mcl CombBLAS) TARGET_LINK_LIBRARIES( betwcent CombBLAS) TARGET_LINK_LIBRARIES( lacc CombBLAS) diff --git 
a/Applications/Incremental/CMakeLists.txt b/Applications/Incremental/CMakeLists.txt index 1ce6c2b6..6dd61baf 100644 --- a/Applications/Incremental/CMakeLists.txt +++ b/Applications/Incremental/CMakeLists.txt @@ -10,7 +10,7 @@ ADD_EXECUTABLE(full Full.cpp) ADD_EXECUTABLE(testideas Test.cpp) ADD_EXECUTABLE(prep-data Prep-Data.cpp) ADD_EXECUTABLE(prep-data-metaclust Prep-Data-Metaclust.cpp) -ADD_EXECUTABLE(lcc LargestCC.cpp) +#ADD_EXECUTABLE(lcc LargestCC.cpp) ADD_EXECUTABLE(inc-pipeline Incremental-Pipeline.cpp) ADD_EXECUTABLE(inc-baseline-pipeline Incremental-Baseline-Pipeline.cpp) ADD_EXECUTABLE(inc-toy-pipeline Incremental-Toy-Pipeline.cpp) @@ -20,7 +20,7 @@ TARGET_LINK_LIBRARIES( full CombBLAS ) TARGET_LINK_LIBRARIES( testideas CombBLAS ) TARGET_LINK_LIBRARIES( prep-data CombBLAS ) TARGET_LINK_LIBRARIES( prep-data-metaclust CombBLAS ) -TARGET_LINK_LIBRARIES( lcc CombBLAS ) +#TARGET_LINK_LIBRARIES( lcc CombBLAS ) TARGET_LINK_LIBRARIES( inc-pipeline CombBLAS ) TARGET_LINK_LIBRARIES( inc-baseline-pipeline CombBLAS ) TARGET_LINK_LIBRARIES( inc-toy-pipeline CombBLAS ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27cf1521..c03d6043 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.3) project(CombBLAS VERSION 2.0.1 LANGUAGES C CXX) - +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # require c++14 set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED YES) @@ -18,14 +18,40 @@ else() target_compile_features(CombBLAS PUBLIC cxx_return_type_deduction) endif() + # set include directories target_include_directories(CombBLAS PUBLIC $ $) target_include_directories(CombBLAS PUBLIC $ $) target_include_directories(CombBLAS PRIVATE include/CombBLAS) -# MPI and OpenMP dependencies +# MPI and OpenMP and CUDA dependencies find_package(MPI REQUIRED) find_package(OpenMP) +# This needs to be split based on if CMake >= 3.17, as this is deprecated above that +find_package(CUDA) + + +if(CUDA_FOUND) + #target_compile_definitions(CombBLAS PUBLIC GPU_ENABLED) + enable_language(CUDA) + #set(CUDA_HOST_COMPILER "nvc++") # NVHPC requires this...sorry if this causes any issues for anyone + set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" "--ftemplate-backtrace-limit 1 --expt-relaxed-constexpr --disable-warnings") + #FILE(GLOB_RECURSE MyCSources *) + #FILE(GLOB_RECURSE MyHSources *.h*) + #cset_source_files_properties(${MyCSources} PROPERTIES LANGUAGE CUDA) + #set_source_files_properties(src/CommGrid.cpp src/MPIType.cpp src/MPIOp.cpp src/MemoryPool.cpp src/hash.cpp PROPERTIES LANGUAGE CXX) + #set_source_files_properties(src/mmio.c PROPERTIES LANGUAGE C) + # Hack to avoid using #include + #target_link_libraries(CombBLAS PUBLIC ${CUDA_LIBRARIES}) + #cuda_compile(CUDASpGEMM include/CombBLAS/cudaSpGEMM.cu) + #target_sources(CombBLAS PRIVATE ${CUDASpGEMM}) + set(THREADS_PREFER_PTHREAD_FLAG OFF) + set_property(TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS $<$:-Xcompiler -pthread> + "$<$>:-pthread>") +else() + message(STATUS "CUDA features disabled") +endif() if(TARGET MPI::MPI_CXX) # Use target if available (CMake >= 3.9) target_link_libraries(CombBLAS PUBLIC MPI::MPI_CXX) @@ -44,6 +70,9 @@ elseif(OPENMP_FOUND) target_link_libraries(CombBLAS PUBLIC "${OpenMP_CXX_FLAGS}") endif() + + + add_subdirectory(usort) target_link_libraries(CombBLAS PUBLIC Usortlib) @@ -116,6 +145,9 @@ install( enable_testing() include(CTest) +# Warnings cause the compiler to crash, surpress them to prevent that +add_definitions(-w) + add_subdirectory(ReleaseTests) add_subdirectory(Applications) 
add_subdirectory(Applications/Ordering) diff --git a/FAQ-combblas-old.html b/FAQ-combblas-old.html index c52d8b18..74050a3f 100644 --- a/FAQ-combblas-old.html +++ b/FAQ-combblas-old.html @@ -1,1397 +1,1397 @@ - - - - - - - - - - -The Combinatorial BLAS Release Notes - - - - - - - - - - - - -
- -

Frequently Asked Questions about -Combinatorial BLAS

- -

Go back -to the the Combinatorial BLAS home page.

- -

 

- -

 

- -

Q1: I would like to use your Combinatorial BLAS code for -some of my experiments which involve sparse matrix multiplication. However, it -is not clear how to write an output of PSpGEMM(…) -function to a file. I've tried to use "put" function of SpParMat, but -it outputs part of the matrix that corresponds to particular process and not -the whole matrix. Is there a way to do it using your code?

- -

 

- -

A1: Yes, SpParMat::SaveGathered(…) -will create a single file output (albeit slow) sorted with increasing row id's -when called like A.SaveGathered("product.txt"). -The caveat is that  "gathered" I/O in human readable form is -quite slow due to serialization on large processor counts. It should only be -used for debugging, ideally. For vectors, we have a much much -faster version:  -FullyDistVec::ParallelWrite (…), which -should be used instead. SpParMat will also get a ParallelWrite soon.

- -

----

- -

 

- -

Q2: Does Combinatorial BLAS support in-node multithreading? 

- -

 

- -

A2: Almost all expensive primitives -(SpGEMM, SpMV with sparse vectors, SpMV with dense vectors, EWiseMult, Apply, -Set) are hybrid multithreaded within a socket. Read this example.

- -

 

- -

---

- -

 

- -

Q3: Reading/writing text files is really slow for my -purposes, what can I do?

- -

 

- -

A3: Starting from version 1.6, we now have -extremely fast matrix market (text) file reading, check out -SpParMat::ParallelReadMM() and FullyDistVec::ParallelReadMM()

- -

 

- -

---

- -

 

- -

Q4: Is there a preferred way to prune elements from a -SpParMat according to a predicate?

- -

 

- -

A4: Yes, SpParMat::Prune(…) will do it according to a predicate. An -overloaded version of the same function, SpParMat::Prune(ri,ci) -will prune all entries whose row indices are in ri and column indices are in ci

- -

 

- -

---

- -

 

- -

Q5: I am trying to run CombBLAS on Windows but the MS MPI does -not seem to support MPI C++ bindings.

- -

 

- -

A5: Combinatorial BLAS recently (version 1.3.0) switched to -C-API for all its internal MPI calls. After that, we've also compiled CombLAS -on a windows machine without problems. However, we recommend using an open -source MPI for windows too, such as MPICH-2.

- -

 

- -

---

- -

 

- -

Q6: I would like to use Combinatorial BLAS for some parallel -sparse matrix-matrix multiplications. This works quite well, however when I try -to assign a m x 1 sparse matrix (actually a vector) to the first column of an -existing sparse matrix with SpAsgn I get an error saying: "Matrix is too -small to be splitted". Is this because it's not possible to use SpAsgn on -vector-like matrices?

- -

 

- -

A6: SpAsgn internally uses a memory efficient Mult_AnXBn_DoubleBuff as opposed to Mult_AnXBn_Synch). -You might probably go into SpParMat<IT,NT,DER>::SpAsgn(...) -and change occuranges of Mult_AnXBn_DoubleBuff -to Mult_AnXBn_Synch. However, this will likely -only solve your problem for the serial case. because ComBBLAS can not -effectively 2D decompose an m x 1 matrix: each dimension should ideally be at -least sqrt(p). It is much better to represent that vector as a FullyDistSpVec.

- -

 

- -

---

- -

 

- -

Q7: Starting from a general sparse matrix Z, I want to -construct the symmetric matrix M: [[X'X; X'Z];[Z'X; Z'Z]], where X is a vector -of 1's. Thus the element at position (1,1) is simply the number of columns of -Z, and the vector Z'X contains the sums per column of Z. For now, I have a -working code, but it is quite sloppy because I do not find a function for which -I can easily increase the dimension of a sparse matrix or even set an element -to a specific value. Is there any function in Combinatorial BLAS which can do -this? 

- -

 

- -

A7: Not out of the box. You don't want an SpAsgn or any -variant of it because it can't grow the matrix. You want some sort of matrix -append. How about using Find(…) and Sparse(…)?  The Matlab care of what -you want to do is:

- -

 

- -

X = ones(size(Z,2),1) 

- -

M = [X' * X, X' * Z; Z'* X, Z' * Z]

- -

 

- -

Supporting such a general concatenation efficiently might be hard -to add at this point. Instead,  there is a Concatenate(…) -function for vectors. Armed with Concatenate(…), find(), and the sparse-like -constructor, one can solve your problem.  Check out the working example in -ReleaseTests/FindSparse.cpp

- -

 

- -

---

- -

 

- -

Q8: Does CombBLAS include the API to perform a symmetric -permutation on a matrix, as explained in your SISC -paper

- -

 

- -

A8: Yes it does. Check out the -ReleaseTests/IndexingTiming.cpp for an example.

- -

 

- -

--- 

- -

 

- -

Q9: How can I use small test case to see whether the -operation on matrix is correct? In other words, how do I print all the -information of a matrix with each value in matrix? 

- -

I can use PrintInfo to print basic information, but it only gives -me number of rows and columns and nnz

- -

 

- -

A9: Our recommendation is to use SaveGathered(…) to dump the whole matrix into a file in triples -(matrix market) format. For vectors, we have a much much faster version:  FullyDistVec::ParallelWrite (…)

- -

 

- -

---

- -

 

- -

Q10: Does CombBLAS code run on any -graph size or there is some limitation on the dimension of the matrix A. I mean -should it be a multiple of sqrt(p) where p is total number of processors. 

- -

 

- -

A10: No, the matrix dimension does not have to be a multiple -of sqrt(p) but it should be bigger than sqrt(p). In other words you can have a -5x5 matrix on 4 processors but not on 36 processors. We don't really see the -point of using more than |V|^2 processors.

- -

 

- -

---

- -

 

- -

Q11: My comparison results on real graph inputs revealed -something weird. In input loc-gowalla, how can 16 processors time(called -time_16) and 

- -

64 processors time(called time_64) which time_64*4<time_16 - which is more than linear scale? 

- -

 

- -

A11: The complexity of the parallel algorithm drops as -sub-matrices owned by each processor gets sparser. In particular, it is -proportional to O(flops x log(ni)) where ni is the size of the intersection of -the set of nonzero columns of Aik and nonzero rows of Bkj for A*B. What might -happen as p increases is that there is a phase transition that makes ni drop -significantly for your input (for p=64, each sub-matrix will have only ~1.2 -nonzeros per row or column). More details are in the SISC -paper and the references therein. Hope this makes sense. This is why -I don't suggest people use CombBLAS for small p (< 40) because it is not on -the top of its game for small number of processors. 

- -

 

- -

--- 

- -

 

- -

Q12: Should the input file have nodes numbered from 1 or it -is fine if the nodes are numbered from 0?

- -

 

- -

A12: If you're using the human readable matrix market format -as your input, then it should be 1-indexed. 

- -

 

- -

---

- -

 

- -

Q13: I'm wondering for breadth-first-search, under the hood -does the matrix-vector multiplication method change based on the sparsity of -the frontier vector, or does the underlying matrix-vector multiplication assume -the frontier is always sparse?

- -

 

- -

A13: Depending on your definition of sparseness, the -frontier is almost always sparse. We use the pragmatic definition of -"sparse" in the sense that a vector is sparse if it is worth taking -advantage of the sparsity in there. I'd guess, for a dense vector assumption to -be competitive, it would have to have at least 1/3 of its potential locations -nonzero. However, I might be wrong (and you're welcome to prove me wrong). To -answer your question more directly, CombBLAS supports both dense and sparse -right hand side vectors, but the specific BFS implementation does not -adapt. 

- -

 

- -

---

- -

 

- -

Q14: Could you briefly explain the difference in your -implementations of matrix-sparse vector and matrix-dense vector multiply? For -example, is the sparse vector case a write-based approach: Every element -updates all of its neighbors (from a graph-theoretic standpoint) locations in -the output vector; and the dense vector case a read-based approach: Every -element reads some value from each of its neighbors and updates its own entry -in the resulting vector?

- -

 

- -

A14: Sparse matrix-sparse vector is "right hand side -vector structure" driven. In y = A*x, for each nonzero x_i, we scale the -column A(:,i) with that and merge the scaled sparse columns results into y. The -computation boils down into merging sparse columns into one. Combinatorial -BLAS is a matrix-vector based library, so thinking in -terms of updates on single entries is probably not the right abstraction.

- -

 

- -

Sparse matrix-dense vector is slightly different in the sense that -it is driven by the matrix structure; you basically stream the matrix. The -correctness of both operations are handled by a SPA-like or heap-like data -structure that merges multiple intermediate values contributing to the same -output location; no atomics are used.

- -

 

- -

--- 

- -

 

- -

Q15: I would like to get your opinion -on how sparse-matrix based implementations compare with more native -implementations

- -

 

- -

A15: Sparse matrix abstraction, like -any abstraction, will leave some performance on the table. In particular it is -prone to performing extra passes over data or creating extra temporaries (if -you've ever programmed in Matlab; this is similar). On the other hand, sparse -matrix abstraction gives you "primitives" to implement graph -"algorithms" as opposed to the algorithms themselves. For instance, -CombBLAS has sparse matrix x sparse vector over a semiring as opposed to BFS, -because now using the same primitive one can implement MIS (maximal independent -set) too, only by changing the semiring. Or one can perform run time -filtering on edges based on the attributes, similarly by changing the semiring -functions (therefore extending functionality to semantic graphs). Indeed this -is what we've done in our upcoming IPDPS'13 paper.

- -

 

- -

---

- -

 

- -

Q16: Is there an effort to incorporate the bottom-up BFS of -Scott Beamer into CombBLAS?

- -

 

- -

A16: Yes, it is already done. Just use the dobfs executable -(made from DirOptBFS.cpp).

- -

 

- -

---

- -

 

- -

Q17: My serial code is faster than CombBLAS on a single -core.

- -

 

- -

A17: I believe that. CombBLAS targets -"scalability", not optimizing the single core performance.

- -

 

- -

Examples:

- -

- think about the 2D BFS. CombBLAS does not use a CSR like data -structure because that is not memory scalable due to problems of hypersparsity in large concurrencies. Instead -CombBLAS opts to use a slower (about 2x around 1000 cores) but memory -scalable format called DCSC.  

- -

- think about betweenness centrality which uses sparse -matrix-matrix multiply. CombBLAS doesn't use the fastest serial algorithm as -its subroutine because it doesn't  scale to thousands of cores. Instead it -uses a outer-product algorithm that is significantly slower for p=1, but scales -indefinitely.

- -

 

- -

--- 

- -

 

- -

Q18: Looking at the output of your Graph500 application, I -noticed a large number of self-edges removed. That’s very interesting.

- -

 

- -

A18: The duplicate edges problem is inherent to the R-MAT -generator on large scale, unless some special kind of noise is added. Check -here for a great analysis of this phenomenon: http://arxiv.org/abs/1102.5046

- -

 

- -

---

- -

 

- -

Q19: How are you counting the number of edges traversed in -Graph500? Is this still using the original verify.c file provided with the -reference version of the Graph500 benchmark and passing in the parent tree?

- -

 

- -

A19: It is calculated by summing the -degrees of the discovered vertices using EWiseMult(…) -followed by a Reduce(…). Degrees are pre-symmetrization -(original edges), so we're not over-counting. However, we count self-loops and -duplicates as mentioned in the benchmark specs.

- -

 

- -

---

- -

 

- -

Q20: My computations -finishes fine but I get an “Attempting to use an MPI routine after finalizing -MPICH” afterwards.

- -

 

- -

A20: To avoid the -finalization error, please imitate an example such as MultTest.cpp: http://gauss.cs.ucsb.edu/~aydin/CombBLAS/html/_mult_test_8cpp_source.html

- -

The curly brackets around the code are intentional. Since -distributed objects have MPI related pointers in them, those pointers are -released once the destructors are called. In C++ (at least until C++11) there -isn’t a good way to call the destructor manually, so the destructor is called -immediately before the program exists, which is after the MPI_Finalize. Since -the MPI related objects are destructed after MPI_Finalize, you see this error. -Try the curly brackets approach.

- -

 

- -

Go back -to the the Combinatorial BLAS home page.

- -
- - - - + + + + + + + + + + +The Combinatorial BLAS Release Notes + + + + + + + + + + + + +
+ +

Frequently Asked Questions about +Combinatorial BLAS

+ +

Go back +to the Combinatorial BLAS home page.

+ +

 

+ +

 

+ +

Q1: I would like to use your Combinatorial BLAS code for +some of my experiments which involve sparse matrix multiplication. However, it +is not clear how to write an output of PSpGEMM(…) +function to a file. I've tried to use "put" function of SpParMat, but +it outputs part of the matrix that corresponds to particular process and not +the whole matrix. Is there a way to do it using your code?

+ +

 

+ +

A1: Yes, SpParMat::SaveGathered(…) +will create a single file output (albeit slow) sorted with increasing row id's +when called like A.SaveGathered("product.txt"). +The caveat is that  "gathered" I/O in human readable form is +quite slow due to serialization on large processor counts. It should only be +used for debugging, ideally. For vectors, we have a much much +faster version:  +FullyDistVec::ParallelWrite (…), which +should be used instead. SpParMat will also get a ParallelWrite soon.

+ +

----

+ +

 

+ +

Q2: Does Combinatorial BLAS support in-node multithreading? 

+ +

 

+ +

A2: Almost all expensive primitives +(SpGEMM, SpMV with sparse vectors, SpMV with dense vectors, EWiseMult, Apply, +Set) are hybrid multithreaded within a socket. Read this example.

+ +

 

+ +

---

+ +

 

+ +

Q3: Reading/writing text files is really slow for my +purposes, what can I do?

+ +

 

+ +

A3: Starting from version 1.6, we now have +extremely fast Matrix Market (text) file reading; check out +SpParMat::ParallelReadMM() and FullyDistVec::ParallelReadMM().

+ +

 

+ +

---

+ +

 

+ +

Q4: Is there a preferred way to prune elements from a +SpParMat according to a predicate?

+ +

 

+ +

A4: Yes, SpParMat::Prune(…) will do it according to a predicate. An +overloaded version of the same function, SpParMat::Prune(ri,ci) +will prune all entries whose row indices are in ri and column indices are in ci

+ +

 

+ +

---

+ +

 

+ +

Q5: I am trying to run CombBLAS on Windows but the MS MPI does +not seem to support MPI C++ bindings.

+ +

 

+ +

A5: Combinatorial BLAS recently (version 1.3.0) switched to +the C API for all its internal MPI calls. Since then, we've also compiled CombBLAS +on a Windows machine without problems. However, we recommend using an open +source MPI on Windows too, such as MPICH-2.

+ +

 

+ +

---

+ +

 

+ +

Q6: I would like to use Combinatorial BLAS for some parallel +sparse matrix-matrix multiplications. This works quite well, however when I try +to assign a m x 1 sparse matrix (actually a vector) to the first column of an +existing sparse matrix with SpAsgn I get an error saying: "Matrix is too +small to be splitted". Is this because it's not possible to use SpAsgn on +vector-like matrices?

+ +

 

+ +

A6: SpAsgn internally uses the memory-efficient Mult_AnXBn_DoubleBuff (as opposed to Mult_AnXBn_Synch). +You could go into SpParMat<IT,NT,DER>::SpAsgn(...) +and change occurrences of Mult_AnXBn_DoubleBuff +to Mult_AnXBn_Synch. However, this will likely +only solve your problem for the serial case, because CombBLAS cannot +effectively 2D decompose an m x 1 matrix: each dimension should ideally be at +least sqrt(p). It is much better to represent that vector as a FullyDistSpVec.

+ +

 

+ +

---

+ +

 

+ +

Q7: Starting from a general sparse matrix Z, I want to +construct the symmetric matrix M: [[X'X; X'Z];[Z'X; Z'Z]], where X is a vector +of 1's. Thus the element at position (1,1) is simply the number of columns of +Z, and the vector Z'X contains the sums per column of Z. For now, I have a +working code, but it is quite sloppy because I do not find a function for which +I can easily increase the dimension of a sparse matrix or even set an element +to a specific value. Is there any function in Combinatorial BLAS which can do +this? 

+ +

 

+ +

A7: Not out of the box. You don't want an SpAsgn or any +variant of it because it can't grow the matrix. You want some sort of matrix +append. How about using Find(…) and Sparse(…)? The Matlab equivalent of what +you want to do is:

+ +

 

+ +

X = ones(size(Z,2),1) 

+ +

M = [X' * X, X' * Z; Z'* X, Z' * Z]

+ +

 

+ +

Supporting such a general concatenation efficiently might be hard +to add at this point. Instead,  there is a Concatenate(…) +function for vectors. Armed with Concatenate(…), find(), and the sparse-like +constructor, one can solve your problem.  Check out the working example in +ReleaseTests/FindSparse.cpp

+ +

 

+ +

---

+ +

 

+ +

Q8: Does CombBLAS include the API to perform a symmetric +permutation on a matrix, as explained in your SISC +paper

+ +

 

+ +

A8: Yes it does. Check out the +ReleaseTests/IndexingTiming.cpp for an example.

+ +

 

+ +

--- 

+ +

 

+ +

Q9: How can I use small test case to see whether the +operation on matrix is correct? In other words, how do I print all the +information of a matrix with each value in matrix? 

+ +

I can use PrintInfo to print basic information, but it only gives +me number of rows and columns and nnz

+ +

 

+ +

A9: Our recommendation is to use SaveGathered(…) to dump the whole matrix into a file in triples +(matrix market) format. For vectors, we have a much much faster version:  FullyDistVec::ParallelWrite (…)

+ +

 

+ +

---

+ +

 

+ +

Q10: Does CombBLAS code run on any +graph size, or is there some limitation on the dimension of the matrix A? I mean, +should it be a multiple of sqrt(p), where p is the total number of processors?

+ +

 

+ +

A10: No, the matrix dimension does not have to be a multiple +of sqrt(p) but it should be bigger than sqrt(p). In other words you can have a +5x5 matrix on 4 processors but not on 36 processors. We don't really see the +point of using more than |V|^2 processors.

+ +

 

+ +

---

+ +

 

+ +

Q11: My comparison results on real graph inputs revealed +something weird. In input loc-gowalla, how can 16 processors time(called +time_16) and 

+ +

64 processors time(called time_64) which time_64*4<time_16 + which is more than linear scale? 

+ +

 

+ +

A11: The complexity of the parallel algorithm drops as +the sub-matrices owned by each processor get sparser. In particular, it is +proportional to O(flops x log(ni)), where ni is the size of the intersection of +the set of nonzero columns of Aik and nonzero rows of Bkj for A*B. What might +happen as p increases is that there is a phase transition that makes ni drop +significantly for your input (for p=64, each sub-matrix will have only ~1.2 +nonzeros per row or column). More details are in the SISC +paper and the references therein. Hope this makes sense. This is why +I don't suggest people use CombBLAS for small p (< 40) because it is not at +the top of its game for small numbers of processors.

+ +

 

+ +

--- 

+ +

 

+ +

Q12: Should the input file have nodes numbered from 1 or it +is fine if the nodes are numbered from 0?

+ +

 

+ +

A12: If you're using the human readable matrix market format +as your input, then it should be 1-indexed. 

+ +

 

+ +

---

+ +

 

+ +

Q13: I'm wondering for breadth-first-search, under the hood +does the matrix-vector multiplication method change based on the sparsity of +the frontier vector, or does the underlying matrix-vector multiplication assume +the frontier is always sparse?

+ +

 

+ +

A13: Depending on your definition of sparseness, the +frontier is almost always sparse. We use the pragmatic definition of +"sparse" in the sense that a vector is sparse if it is worth taking +advantage of the sparsity in there. I'd guess, for a dense vector assumption to +be competitive, it would have to have at least 1/3 of its potential locations +nonzero. However, I might be wrong (and you're welcome to prove me wrong). To +answer your question more directly, CombBLAS supports both dense and sparse +right hand side vectors, but the specific BFS implementation does not +adapt. 

+ +

 

+ +

---

+ +

 

+ +

Q14: Could you briefly explain the difference in your +implementations of matrix-sparse vector and matrix-dense vector multiply? For +example, is the sparse vector case a write-based approach: Every element +updates all of its neighbors (from a graph-theoretic standpoint) locations in +the output vector; and the dense vector case a read-based approach: Every +element reads some value from each of its neighbors and updates its own entry +in the resulting vector?

+ +

 

+ +

A14: Sparse matrix-sparse vector is "right hand side +vector structure" driven. In y = A*x, for each nonzero x_i, we scale the +column A(:,i) with that and merge the scaled sparse columns results into y. The +computation boils down into merging sparse columns into one. Combinatorial +BLAS is a matrix-vector based library, so thinking in +terms of updates on single entries is probably not the right abstraction.

+ +

 

+ +

Sparse matrix-dense vector is slightly different in the sense that +it is driven by the matrix structure; you basically stream the matrix. The +correctness of both operations is handled by a SPA-like or heap-like data +structure that merges multiple intermediate values contributing to the same +output location; no atomics are used.

+ +

 

+ +

--- 

+ +

 

+ +

Q15: I would like to get your opinion +on how sparse-matrix based implementations compare with more native +implementations

+ +

 

+ +

A15: Sparse matrix abstraction, like +any abstraction, will leave some performance on the table. In particular it is +prone to performing extra passes over data or creating extra temporaries (if +you've ever programmed in Matlab; this is similar). On the other hand, sparse +matrix abstraction gives you "primitives" to implement graph +"algorithms" as opposed to the algorithms themselves. For instance, +CombBLAS has sparse matrix x sparse vector over a semiring as opposed to BFS, +because now using the same primitive one can implement MIS (maximal independent +set) too, only by changing the semiring. Or one can perform run time +filtering on edges based on the attributes, similarly by changing the semiring +functions (therefore extending functionality to semantic graphs). Indeed this +is what we've done in our upcoming IPDPS'13 paper.

+ +

 

+ +

---

+ +

 

+ +

Q16: Is there an effort to incorporate the bottom-up BFS of +Scott Beamer into CombBLAS?

+ +

 

+ +

A16: Yes, it is already done. Just use the dobfs executable +(made from DirOptBFS.cpp).

+ +

 

+ +

---

+ +

 

+ +

Q17: My serial code is faster than CombBLAS on a single +core.

+ +

 

+ +

A17: I believe that. CombBLAS targets +"scalability", not optimizing the single core performance.

+ +

 

+ +

Examples:

+ +

- think about the 2D BFS. CombBLAS does not use a CSR like data +structure because that is not memory scalable due to problems of hypersparsity in large concurrencies. Instead +CombBLAS opts to use a slower (about 2x around 1000 cores) but memory +scalable format called DCSC.  

+ +

- think about betweenness centrality, which uses sparse +matrix-matrix multiply. CombBLAS doesn't use the fastest serial algorithm as +its subroutine because it doesn't scale to thousands of cores. Instead it +uses an outer-product algorithm that is significantly slower for p=1, but scales +indefinitely.

+ +

 

+ +

--- 

+ +

 

+ +

Q18: Looking at the output of your Graph500 application, I +noticed a large number of self-edges removed. That’s very interesting.

+ +

 

+ +

A18: The duplicate edges problem is inherent to the R-MAT +generator on large scale, unless some special kind of noise is added. Check +here for a great analysis of this phenomenon: http://arxiv.org/abs/1102.5046

+ +

 

+ +

---

+ +

 

+ +

Q19: How are you counting the number of edges traversed in +Graph500? Is this still using the original verify.c file provided with the +reference version of the Graph500 benchmark and passing in the parent tree?

+ +

 

+ +

A19: It is calculated by summing the +degrees of the discovered vertices using EWiseMult(…) +followed by a Reduce(…). Degrees are pre-symmetrization +(original edges), so we're not over-counting. However, we count self-loops and +duplicates as mentioned in the benchmark specs.

+ +

 

+ +

---

+ +

 

+ +

Q20: My computation +finishes fine, but I get an “Attempting to use an MPI routine after finalizing +MPICH” error afterwards.

+ +

 

+ +

A20: To avoid the +finalization error, please imitate an example such as MultTest.cpp: http://gauss.cs.ucsb.edu/~aydin/CombBLAS/html/_mult_test_8cpp_source.html

+ +

The curly brackets around the code are intentional. Since +distributed objects have MPI-related pointers in them, those pointers are +released once the destructors are called. In C++ (at least until C++11) there +isn’t a good way to call the destructor manually, so the destructor is called +immediately before the program exits, which is after MPI_Finalize. Since +the MPI-related objects are destructed after MPI_Finalize, you see this error. +Try the curly brackets approach.

+ +

 

+ +

Go back +to the Combinatorial BLAS home page.

+ +
+ + + + diff --git a/ReleaseTests/CMakeLists.txt b/ReleaseTests/CMakeLists.txt index e37c57d1..605c0395 100644 --- a/ReleaseTests/CMakeLists.txt +++ b/ReleaseTests/CMakeLists.txt @@ -1,7 +1,13 @@ # Top level directory has the include files +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ADD_EXECUTABLE( MultTiming MultTiming.cpp ) ADD_EXECUTABLE( MultTest MultTest.cpp ) +cuda_add_executable( MultTimingCUDA MultTimingCUDA.cu) +target_compile_options(MultTimingCUDA PRIVATE -Mlarge_arrays) +cuda_add_executable( MultAccuracyCUDA MultAccuracyCUDA.cu) + + ADD_EXECUTABLE( ReduceTest ReduceTest.cpp ) ADD_EXECUTABLE( TransposeTest TransposeTest.cpp ) ADD_EXECUTABLE( IteratorTest IteratorTest.cpp ) @@ -21,7 +27,11 @@ ADD_EXECUTABLE( KTipsTest KTipsTest.cpp ) TARGET_LINK_LIBRARIES( MultTiming CombBLAS) TARGET_LINK_LIBRARIES( MultTest CombBLAS) -TARGET_LINK_LIBRARIES( ReduceTest CombBLAS) + + +TARGET_LINK_LIBRARIES( MultTimingCUDA CombBLAS) +TARGET_LINK_LIBRARIES( MultAccuracyCUDA CombBLAS) +TARGET_LINK_LIBRARIES( ReduceTest CombBLAS MPI::MPI_CXX) TARGET_LINK_LIBRARIES( TransposeTest CombBLAS) TARGET_LINK_LIBRARIES( IteratorTest CombBLAS) TARGET_LINK_LIBRARIES( IndexingTest CombBLAS) @@ -40,6 +50,7 @@ TARGET_LINK_LIBRARIES( KTipsTest CombBLAS) ADD_TEST(NAME GenMMWrite_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ 20 16 1 scale20_ef16_symmetric.mtx) ADD_TEST(NAME Multiplication_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ../TESTDATA/x_65536_halfdense.txt ../TESTDATA/y_65536_halfdense.txt ) +ADD_TEST(NAME Multiplication_Test_CUDA COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ../TESTDATA/x_65536_halfdense.txt ../TESTDATA/y_65536_halfdense.txt ) ADD_TEST(NAME SpGEMM3D_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 16 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ) ADD_TEST(NAME HashSpGEMMTest COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 $ ../TESTDATA/rmat_scale16_A.mtx ../TESTDATA/rmat_scale16_B.mtx ../TESTDATA/rmat_scale16_productAB.mtx ) ADD_TEST(NAME Reduction_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/sprand10000 ../TESTDATA/sprand10000_sumcols ../TESTDATA/sprand10000_sumrows) @@ -48,4 +59,4 @@ ADD_TEST(NAME Transpose_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA B_100x100.txt B_10x30_Indexed.txt rand10outta100.txt rand30outta100.txt) ADD_TEST(NAME SpAsgn_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA A_100x100.txt A_with20x30hole.txt dense_20x30matrix.txt A_wdenseblocks.txt 20outta100.txt 30outta100.txt) ADD_TEST(NAME GalerkinNew_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA/grid3d_k5.txt ../TESTDATA/offdiag_grid3d_k5.txt ../TESTDATA/diag_grid3d_k5.txt ../TESTDATA/restrict_T_grid3d_k5.txt) -ADD_TEST(NAME FindSparse_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA findmatrix.txt) +ADD_TEST(NAME FindSparse_Test COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 $ ../TESTDATA findmatrix.txt) \ No newline at end of file diff --git a/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err b/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GALERKIN/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime 
/work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err b/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GalerkinResults/Galerkin/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err b/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err index 06917ac1..fc3c51be 100644 --- a/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err +++ b/ReleaseTests/GalerkinResults/failed/galerkin1024.1284895.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/galerkin_scale23_order4 -Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i113-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
[i140-203:00879] *** Process received signal *** [i140-203:00879] Signal: Segmentation fault (11) [i140-203:00879] Signal code: Address not mapped (1) diff --git a/ReleaseTests/InducedSubgraphsTest.cpp b/ReleaseTests/InducedSubgraphsTest.cpp index 22f6a446..9e6aef17 100644 --- a/ReleaseTests/InducedSubgraphsTest.cpp +++ b/ReleaseTests/InducedSubgraphsTest.cpp @@ -1,49 +1,56 @@ -#include -#include -#include +#include "CombBLAS/CombBLAS.h" #include -#include +#include +#include +#include #include -#include "CombBLAS/CombBLAS.h" - -int main(int argc, char *argv[]) -{ - int nprocs, myrank; - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - if (argc < 3) { - if (!myrank) - std::cerr << "Usage: ./Subgraphs2ProcsTest " << std::endl; - MPI_Finalize(); - return -1; - } - { - if (!myrank) std::cerr << "processor grid: (" << std::sqrt(nprocs) << " x " << std::sqrt(nprocs) << ")" << std::endl; - - std::shared_ptr fullWorld; - fullWorld.reset(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0)); - - combblas::SpParMat > A(fullWorld); - combblas::FullyDistVec assignments(A.getcommgrid()); - - A.ParallelReadMM(std::string(argv[1]), true, combblas::maximum()); - assignments.ParallelRead(std::string(argv[2]), true, combblas::maximum()); - - std::vector local_idx_map; - - combblas::SpCCols locmat = A.InducedSubgraphs2Procs(assignments, local_idx_map); +#include - for (auto colit = locmat.begcol(); colit != locmat.endcol(); ++colit) { - for (auto nzit = locmat.begnz(colit); nzit != locmat.endnz(colit); ++nzit) { - std::cout << myrank << ": " << local_idx_map[nzit.rowid()]+1 << "\t" << local_idx_map[colit.colid()]+1 << "\t" << nzit.value() << std::endl; - } - } - std::cout << std::endl; +int main(int argc, char *argv[]) { + int nprocs, myrank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (argc < 3) { + if (!myrank) + std::cerr << "Usage: ./Subgraphs2ProcsTest " + << std::endl; + MPI_Finalize(); + return -1; + } + { + if (!myrank) + std::cerr << "processor grid: (" << std::sqrt(nprocs) << " x " + << std::sqrt(nprocs) << ")" << std::endl; + + std::shared_ptr fullWorld; + fullWorld.reset(new combblas::CommGrid(MPI_COMM_WORLD, 0, 0)); + + combblas::SpParMat> A( + fullWorld); + combblas::FullyDistVec assignments(A.getcommgrid()); + + A.ParallelReadMM(std::string(argv[1]), true, combblas::maximum()); + assignments.ParallelRead(std::string(argv[2]), true, + combblas::maximum()); + + std::vector local_idx_map; + + combblas::SpCCols locmat = + A.InducedSubgraphs2Procs(assignments, local_idx_map); + + for (auto colit = locmat.begcol(); colit != locmat.endcol(); ++colit) { + for (auto nzit = locmat.begnz(colit); nzit != locmat.endnz(colit); + ++nzit) { + std::cout << myrank << ": " << local_idx_map[nzit.rowid()] + 1 << "\t" + << local_idx_map[colit.colid()] + 1 << "\t" << nzit.value() + << std::endl; + } } + std::cout << std::endl; + } - MPI_Finalize(); - return 0; + MPI_Finalize(); + return 0; } diff --git a/ReleaseTests/MultAccuracyCUDA.cu b/ReleaseTests/MultAccuracyCUDA.cu new file mode 100644 index 00000000..a9ba1e0a --- /dev/null +++ b/ReleaseTests/MultAccuracyCUDA.cu @@ -0,0 +1,172 @@ +/****************************************************************/ +/* Parallel Combinatorial BLAS Library (for Graph Computations) */ +/* version 1.6 -------------------------------------------------*/ +/* date: 6/15/2017 ---------------------------------------------*/ +/* authors: Ariful Azad, Aydin Buluc 
--------------------------*/ +/****************************************************************/ +/* + Copyright (c) 2010-2017, The Regents of the University of California + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +// #include + +#ifdef __CUDACC__ + +#include +#include +#include +#include +#include +#include +#include +#include "CombBLAS/CombBLAS.h" +// #include "../include/GALATIC/source/device/Multiply.cuh" + +using namespace std; +using namespace combblas; + +#ifdef TIMING +double cblas_alltoalltime; +double cblas_allgathertime; +#endif + +#ifdef _OPENMP +int cblas_splits = omp_get_max_threads(); +#else +int cblas_splits = 1; +#endif + +#define ElementType double +int ITERATIONS = 50; + +// Simple helper class for declarations: Just the numerical type is templated +// The index type and the sequential matrix type stays the same for the whole code +// In this case, they are "int" and "SpDCCols" +template +class PSpMat +{ +public: + typedef SpDCCols DCCols; + typedef SpParMat MPI_DCCols; +}; + +// Outline of debug stages +// stage = 0: LocalHybrid does not run/immediately returns +// stage = 1: LocalHybrid mallocs and transposes as needed, but returns immediately after +// stage = 2: LocalHybrid runs the kernel, but does not perform cleanup +// stage = 3: Full run of LocalHybrid +// stages 1 & 2 may lead to memory leaks, be aware on memory limited systems +int main(int argc, char *argv[]) +{ +#ifdef GPU_ENABLED +// SpParHelper::Print("GPU ENABLED\n"); +#endif + int nprocs, myrank; + int host_rank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + typedef PlusTimesSRing PTDOUBLEDOUBLE; + + if (argc < 3) + { + if (myrank == 0) + { + cout << "Usage: ./MultTest " << endl; + cout << ",, are absolute addresses, and files should be in triples format" << endl; + } + MPI_Finalize(); + return -1; + } + { + string Aname(argv[1]); + string Bname(argv[2]); + + if (myrank == 0 || nprocs == 1) + { + std::cout << Aname << std::endl; + std::cout << Bname << std::endl; + } + typedef PlusTimesSRing MinPlusSRing; + typedef SelectMaxSRing SR; + + shared_ptr fullWorld; + fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); + + std::cout << "Constructing objects:" << std::endl; + // construct objects + PSpMat::MPI_DCCols A(fullWorld); + PSpMat::MPI_DCCols B(fullWorld); + PSpMat::MPI_DCCols C(fullWorld); + PSpMat::MPI_DCCols CControl(fullWorld); + + A.ParallelReadMM(Aname, true, maximum()); +#ifndef NOGEMM + B.ParallelReadMM(Bname, true, maximum()); 
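// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the commit above): the CPU reference
// product over the plus-times semiring with the template arguments written
// out explicitly. The argument order <semiring, output value type, output
// local matrix type> follows the CombBLAS Mult_AnXBn_DoubleBuff usage in this
// file; the helper name CpuReference and the spelled-out arguments are
// assumptions of this sketch, not code from the patch.
static PSpMat<double>::MPI_DCCols CpuReference(PSpMat<double>::MPI_DCCols &A,
                                               PSpMat<double>::MPI_DCCols &B)
{
    // Plus-times semiring over doubles, matching the PTDOUBLEDOUBLE typedef above.
    typedef PlusTimesSRing<double, double> PTDD;
    // Distributed SpGEMM on the host; the CUDA path in this test is compared
    // against exactly this kind of call.
    return Mult_AnXBn_DoubleBuff<PTDD, double, PSpMat<double>::DCCols>(A, B);
}
// ---------------------------------------------------------------------------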
+ +#endif + A.PrintInfo(); + +#ifndef NOGEMM + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + C.PrintInfo(); + cudaDeviceSynchronize(); + { + CControl = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + C.PrintInfo(); + if (CControl == C) + { + SpParHelper::Print("Double buffered multiplication working correctly\n"); + } + else + { + SpParHelper::Print("ERROR in double CUDA buffered multiplication, from CPU!\n"); + A.PrintInfo(); + C.PrintInfo(); + CControl.PrintInfo(); + SpDCCols spdcsc = C.seq(); + Dcsc *dcsc = C.seq().GetDCSC(); + double maxdiff = 0; + double a = 0; + double b = 0; + for (int i = 0; i < spdcsc.getnnz(); ++i) + { + if (abs(dcsc->numx[i] - CControl.seq().GetDCSC()->numx[i]) > maxdiff) + { + maxdiff = abs(dcsc->numx[i] - CControl.seq().GetDCSC()->numx[i]); + a = dcsc->numx[i]; + b = CControl.seq().GetDCSC()->numx[i]; + } + } + std::cout << "MAX DIFF = " << maxdiff << std::endl; + std::cout << a << std::endl; + std::cout << b << std::endl; + } + } + } +#endif + +MPI_Finalize(); +return 0; +} +#endif \ No newline at end of file diff --git a/ReleaseTests/MultTimingCUDA.cu b/ReleaseTests/MultTimingCUDA.cu new file mode 100644 index 00000000..09d1e477 --- /dev/null +++ b/ReleaseTests/MultTimingCUDA.cu @@ -0,0 +1,280 @@ +/****************************************************************/ +/* Parallel Combinatorial BLAS Library (for Graph Computations) */ +/* version 1.6 -------------------------------------------------*/ +/* date: 6/15/2017 ---------------------------------------------*/ +/* authors: Ariful Azad, Aydin Buluc --------------------------*/ +/****************************************************************/ +/* + Copyright (c) 2010-2017, The Regents of the University of California + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ */ + +// #include + +#ifdef __CUDACC__ + +#include +#include +#include +#include +#include +#include +#include +#include "CombBLAS/CombBLAS.h" +// #include "../include/GALATIC/source/device/Multiply.cuh" + +using namespace std; +using namespace combblas; + +#ifdef TIMING +double cblas_alltoalltime; +double cblas_allgathertime; +#endif + +#ifdef _OPENMP +int cblas_splits = omp_get_max_threads(); +#else +int cblas_splits = 1; +#endif + +#define ElementType double +int ITERATIONS = 50; + +// Simple helper class for declarations: Just the numerical type is templated +// The index type and the sequential matrix type stays the same for the whole code +// In this case, they are "int" and "SpDCCols" +template +class PSpMat +{ +public: + typedef SpDCCols DCCols; + typedef SpParMat MPI_DCCols; +}; + +// Outline of debug stages +// stage = 0: LocalHybrid does not run/immediately returns +// stage = 1: LocalHybrid mallocs and transposes as needed, but returns immediately after +// stage = 2: LocalHybrid runs the kernel, but does not perform cleanup +// stage = 3: Full run of LocalHybrid +// stages 1 & 2 may lead to memory leaks, be aware on memory limited systems +int main(int argc, char *argv[]) +{ +#ifdef GPU_ENABLED +// SpParHelper::Print("GPU ENABLED\n"); +#endif + int nprocs, myrank; + int host_rank; + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + typedef PlusTimesSRing PTDOUBLEDOUBLE; + + + if (argc < 4) + { + if (myrank == 0) + { + cout << "Usage: ./MultTest " << endl; + cout << ",, are absolute addresses, and files should be in triples format" << endl; + } + MPI_Finalize(); + return -1; + } + { + string ITERS(argv[1]); + string COMMTEST(argv[2]); + string Aname(argv[3]); + string Bname(argv[4]); + + if(myrank == 0 || nprocs == 1) { + std::cout << Aname << std::endl; + std::cout << Bname << std::endl; + std::cout << nprocs << std::endl; + std::string filename = "output" + Aname.substr(0, Aname.length() - 4) + ".txt"; + + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + fprintf(f, "Input A: %s, with NPROCS: %i\n", Aname.c_str(), nprocs); + fclose(f); + } + ITERATIONS = std::stoi(ITERS); + + bool COMMTESTON = std::stoi(COMMTEST) > 0; + //if(!COMMTESTON) GPUTradeoff = 1024 * 100 * 500; + MPI_Barrier(MPI_COMM_WORLD); + typedef PlusTimesSRing MinPlusSRing; + typedef SelectMaxSRing SR; + + shared_ptr fullWorld; + fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); + + // construct objects + PSpMat::MPI_DCCols A(fullWorld); + PSpMat::MPI_DCCols B(fullWorld); + PSpMat::MPI_DCCols C(fullWorld); + + A.ParallelReadMM(Aname, true, maximum()); +#ifndef NOGEMM + B.ParallelReadMM(Bname, true, maximum()); + +#endif + //A.PrintInfo(); + +#ifndef NOGEMM + double t3 = MPI_Wtime(); + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + double t4 = MPI_Wtime(); + std::cout << "Time taken: " << t4 - t3 << std::endl; + + C.PrintInfo(); + cudaDeviceSynchronize(); + { // force the calling of C's destructor + t3 = MPI_Wtime(); + //C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + t4 = MPI_Wtime(); + std::cout << "Time taken: " << t4 - t3 << std::endl; + C.PrintInfo(); + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + double t1 = MPI_Wtime(); // initilize (wall-clock) timer + for (int i = 0; i < 
ITERATIONS; i++) + { + C = Mult_AnXBn_DoubleBuff::DCCols>(A, B); + } + MPI_Barrier(MPI_COMM_WORLD); + double t2 = MPI_Wtime(); + MPI_Pcontrol(-1, "SpGEMM_DoubleBuff"); + if (myrank == 0 || nprocs == 1) + { + std::string filename = "output" + Aname.substr(0,Aname.length() - 4) + ".txt"; + //std::cout << filename.c_str() << std::endl; + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + fprintf(f, "CPU Time: %.6lf\n", (t2 - t1) / ((double) ITERATIONS)); + fclose(f); + } + int maxhits = 0; + for (int j = 0; j < 500; ++j) + { + //if(!COMMTESTON) j = 500; + //std::cout << j << std::endl; + size_t free, total; + int id; + MPI_Comm_rank(MPI_COMM_WORLD, &id); + cudaMemGetInfo(&free, &total); + //std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl; + + commtime = 0; + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + { + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + + int svdhits = datahits + rowshits + colhits; + int commper = comms; + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + GPUTradeoff = 1024 * 100 * j; + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + { + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + + bool allt; + int nnprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nnprocs); + int newhits = datahits + rowshits + colhits; + if (myrank == 0) { + for(int i = 1; i < nnprocs; ++i) { + MPI_Status idc; + int recv; + MPI_Recv(&recv, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &idc); + svdhits += recv; + MPI_Recv(&recv, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &idc); + newhits += recv; + } + } else { + MPI_Send(&svdhits, 1, MPI_INT, 0, 0, MPI_COMM_WORLD); + MPI_Send(&newhits, 1, MPI_INT, 0, 0, MPI_COMM_WORLD); + } + allt = j > 0 && svdhits == newhits; + if(j == 0) maxhits = newhits; + MPI_Bcast(&allt, 1, MPI_INT, 0, MPI_COMM_WORLD); + if(allt) { + continue; + } + comms = 0; + datahits = 0; + rowshits = 0; + colhits = 0; + commtime = 0; + comptime = 0; + checkingTime = 0; + // std::cout << "Running with tradeoff of " << 100 * j << "KB" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + MPI_Pcontrol(1, "SpGEMM_DoubleBuff"); + t1 = MPI_Wtime(); // initilize (wall-clock) timer + + for (int i = 0; i < ITERATIONS; i++) + { + // std::cout << "--------------NEW ITER------------" << std::endl; + C = Mult_AnXBn_DoubleBuff_CUDA::DCCols>(A, B); + } + MPI_Barrier(MPI_COMM_WORLD); + t2 = MPI_Wtime(); + MPI_Pcontrol(-1, "SpGEMM_DoubleBuff"); + commper = 3 * nnprocs * nnprocs; + if (myrank == 0 || nprocs == 1) + { + std::string filename = "output" + Aname.substr(0,Aname.length() - 4) + ".txt"; + //std::cout << filename.c_str() << std::endl; + FILE *f = fopen(filename.c_str(), "a"); + if(f==NULL){printf("failed to open file: permission issue ?\n");exit(1);} + // cout << "Double buffered CUDA multiplications finished" << endl; + printf("%i,%i,%i,%.6lf,%.6lf,%.6lf,%.6lf\n", GPUTradeoff / 1024, newhits,maxhits, (t2 - t1) / (double)ITERATIONS, (commtime) / (double)ITERATIONS,comptime / (double) ITERATIONS, checkingTime / (double) ITERATIONS); + fprintf(f, "%i,%i,%i,%.6lf,%.6lf,%.6lf\n", GPUTradeoff / 1024, newhits,maxhits, (t2 - t1) / (double)ITERATIONS, (commtime) / (double)ITERATIONS,comptime / (double) ITERATIONS); + fclose(f); + } + if(!COMMTESTON) break; + if(!newhits) break; + if(nprocs == 1) break; + } +#endif 
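// ---------------------------------------------------------------------------
// Illustrative sketch (hedged): the timing pattern used repeatedly above,
// factored into a helper. The helper name AverageSeconds and its callable
// parameter are inventions of this sketch; the barrier / MPI_Wtime / average
// structure mirrors the ITERATIONS loops in this file.
template <typename MultFn>
static double AverageSeconds(MultFn &&mult, int iterations)
{
    MPI_Barrier(MPI_COMM_WORLD);               // start all ranks together
    double t1 = MPI_Wtime();                   // wall-clock start
    for (int i = 0; i < iterations; ++i)
        mult();                                // one distributed SpGEMM per iteration
    MPI_Barrier(MPI_COMM_WORLD);               // wait for the slowest rank
    double t2 = MPI_Wtime();
    return (t2 - t1) / static_cast<double>(iterations);
}
// Usage (sketch only; template arguments as assumed in the sketch above):
//   double avg = AverageSeconds([&] {
//       C = Mult_AnXBn_DoubleBuff_CUDA<PTDOUBLEDOUBLE, double,
//                                      PSpMat<double>::DCCols>(A, B);
//   }, ITERATIONS);
// ---------------------------------------------------------------------------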
+ } + MPI_Finalize(); + return 0; +} + +#endif \ No newline at end of file diff --git a/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err b/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err index fd21442a..db062968 100644 --- a/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err +++ b/ReleaseTests/NWAYSCALE22/btwcent1024.1246100.err @@ -1,154 +1,154 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i153-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i153-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i131-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i102-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i134-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i126-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i106-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i136-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i163-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i101-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err b/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err index b40ad933..4f43ecb7 100644 --- a/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err +++ b/ReleaseTests/NWAYSCALE22/btwcent1024.1256708.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i161-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
[Log-diff residue condensed: the remainder of this span consists of diffs of TACC Ranger job error logs in which every line is deleted and re-added with identical visible text (an apparent line-ending/whitespace-only normalization). Each hunk carries only the "ibrun ./MultTime <input1> <input2>" shell-trace line followed by repeated "Warning: Permanently added '<node>.ranger.tacc.utexas.edu' (RSA) to the list of known hosts." messages; no logged content changes. Besides the tail of the preceding hunk, the affected files are:
  ReleaseTests/NWAYSCALE22/btwcent256.1246095.err
  ReleaseTests/NWAYSCALE22/btwcent256.1246099.err
  ReleaseTests/NWAYSCALE22/btwcent256.1246103.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246094.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246098.err
  ReleaseTests/NWAYSCALE22/btwcent64.1246102.err
  ReleaseTests/SCALE21RMATRMAT/btwcent100.1243955.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1024.1243969.err
  ReleaseTests/SCALE21RMATRMAT/btwcent121.1243956.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1225.1243970.err
  ReleaseTests/SCALE21RMATRMAT/btwcent144.1243957.err
  ReleaseTests/SCALE21RMATRMAT/btwcent1600.1243972.err (hunk continues past this span)]
+Warning: Permanently added 'i154-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i112-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err b/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err index dc168e3b..86d6ed61 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent169.1243958.err @@ -1,8 +1,8 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i107-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i119-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err b/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err index b719fcd3..9eab4595 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent196.1243959.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i116-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err b/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err index 72cfd40c..5435dd41 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent225.1243960.err @@ -1,10 +1,10 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i134-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i134-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err b/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err index 3f25fddb..a8cdcbff 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent25.1243329.err @@ -1,2 +1,2 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i114-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err b/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err index 8daee5b7..eceadf6a 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent256.1243962.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i149-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err b/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err index 943185a4..498c55e2 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent36.1243332.err @@ -1,2 +1,2 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i175-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err b/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err index 729e4e0e..10499057 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent441.1243964.err @@ -1,22 +1,22 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i154-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i123-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i118-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err b/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err index a6bf664a..8f7353ee 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent49.1243333.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i176-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err b/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err index 286a18d2..0abbd7de 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent64.1243953.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i103-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err b/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err index 859a5a41..ba156daa 100644 --- a/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err +++ b/ReleaseTests/SCALE21RMATRMAT/btwcent81.1243954.err @@ -1,6 +1,6 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE21-RMAT/input1_0 /work/00919/tg459476/SCALE21-RMAT/input2_0 -Warning: Permanently added 'i132-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i137-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err b/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err index 750a79cb..a9ec9e02 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent100.1254792.err @@ -1,4 +1,4 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i146-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err index a2817758..8f65f85c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1024.1246086.err @@ -1,33 +1,33 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i144-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i171-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i104-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err b/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err index e61c7640..0d2f0ad6 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent121.1246077.err @@ -1,5 +1,5 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i143-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err index ffe42819..52a3260c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256722.err @@ -1,20 +1,20 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i123-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i132-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i123-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err index e853b2ac..10f84598 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1225.1256737.err @@ -1,29 +1,29 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i137-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i172-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err b/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err index 41626c39..4e5682d4 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent144.1246078.err @@ -1,9 +1,9 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i117-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err b/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err index 1373701b..9198e022 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent1600.1246088.err @@ -1,78 +1,78 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i135-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i102-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i116-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i138-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i175-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i162-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i110-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i102-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i101-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i138-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i133-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i110-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err b/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err index 8e9b9669..02a0f13d 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent169.1246079.err @@ -1,8 +1,8 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i178-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i151-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err b/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err index 774a6453..2f1fc2fe 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent196.1246080.err @@ -1,13 +1,13 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i163-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err b/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err index 7a11bb05..7989a7d9 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent225.1246081.err @@ -1,10 +1,10 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i162-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i140-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err b/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err index 1a01c0ac..8216fe16 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent36.1246072.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i169-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err b/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err index 3aca29a5..7a4d8e91 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent441.1246084.err @@ -1,17 +1,17 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i106-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i156-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i112-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i112-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err b/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err index 78e1e9fe..d60854d2 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent49.1246073.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i153-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err b/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err index 966d74a2..5cb3683c 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent625.1246085.err @@ -1,20 +1,20 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i130-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i159-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i153-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i153-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err b/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err index 7e554618..9e0fbc4a 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent64.1243334.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i157-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err b/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err index 9afcae33..49669163 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent81.1246075.err @@ -1,4 +1,4 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i148-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i128-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i128-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err b/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err index 54ae5504..7e59aa24 100644 --- a/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err +++ b/ReleaseTests/SCALE22RMATRMAT/btwcent848.1256710.err @@ -1,33 +1,33 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE22-RMAT/input1_0 /work/00919/tg459476/SCALE22-RMAT/input2_0 -Warning: Permanently added 'i106-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i119-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i173-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i157-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i155-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i116-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i159-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i119-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i173-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i157-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-203.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i116-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i178-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i159-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err index 9756fcce..8e20c805 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1024.1258019.err @@ -1,18 +1,18 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i104-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i147-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i114-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i147-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err index 9df94063..0225f425 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1225.1256763.err @@ -1,34 +1,34 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i101-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i179-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i134-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i175-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i101-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i120-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i113-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i104-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i129-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i143-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i179-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i134-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i175-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i101-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i113-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i113-206.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i104-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i129-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-404.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i143-110.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err b/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err index 821f5473..813b8916 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent144.1258024.err @@ -1,5 +1,5 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i136-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err b/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err index ed2e1bfc..432f9f70 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent1600.1256764.err @@ -1,50 +1,50 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i140-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i154-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i107-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i124-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i107-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i125-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i106-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i169-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i131-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i141-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
-Warning: Permanently added 'i133-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i154-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i124-102.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i107-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i117-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i125-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-103.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i169-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i131-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-308.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i176-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i141-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-310.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-410.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err b/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err index 81294884..dc0f997e 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent256.1256756.err @@ -1,7 +1,7 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i106-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i163-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i106-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i163-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err b/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err index 4bdcb9fb..c42de063 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent441.1256758.err @@ -1,17 +1,17 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i171-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i148-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i133-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i109-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i180-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i164-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i176-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i151-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i111-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i146-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i140-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-311.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-412.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i148-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i133-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i109-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i180-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i164-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i176-212.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i151-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i111-302.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i146-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i140-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err b/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err index e200e72c..ec5cd52a 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent625.1258049.err @@ -1,14 +1,14 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i162-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i144-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i170-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i165-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i114-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i126-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i156-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i139-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i162-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-201.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i144-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i170-105.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i165-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i114-309.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i126-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i156-106.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err b/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err index 9fa2573f..6df771ed 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent64.1256754.err @@ -1,3 +1,3 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i151-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i108-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i151-403.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i108-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err b/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err index 31325eb2..783078c1 100644 --- a/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err +++ b/ReleaseTests/SCALE23RMATRMAT/btwcent848.1256760.err @@ -1,22 +1,22 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE23-RMAT/input1_0 /work/00919/tg459476/SCALE23-RMAT/input2_0 -Warning: Permanently added 'i139-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i155-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i177-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i181-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i171-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i135-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i121-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i117-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i149-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i105-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i174-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i142-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i127-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i168-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i137-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i160-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i139-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i152-408.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i155-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i177-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-108.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i181-209.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i171-301.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i135-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i121-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i117-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i149-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i105-211.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-207.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-411.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-401.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i142-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i127-402.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i168-112.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i137-107.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i160-304.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. diff --git a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err index 787cbd02..21398f2b 100644 --- a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err +++ b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269002.err @@ -1,23 +1,23 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE26-RMAT/rmat26.txt /work/00919/tg459476/SCALE26-RMAT/fringe_scale26_rect8192_sparse1000 -Warning: Permanently added 'i174-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i152-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i166-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i161-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i120-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i150-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i132-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i130-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i118-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i136-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i145-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i172-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i167-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i158-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i103-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i122-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i174-109.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. 
+Warning: Permanently added 'i152-111.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i166-204.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i161-208.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i120-202.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i150-407.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i132-210.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i130-306.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-205.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i118-303.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i136-101.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i145-307.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i172-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i167-104.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i158-305.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i103-405.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-406.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i138-410:31041] *** Process received signal *** [i138-410:31041] Signal: Segmentation fault (11) [i138-410:31041] Signal code: Address not mapped (1) diff --git a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err index 9fdf8bb8..ca39c7f1 100644 --- a/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err +++ b/ReleaseTests/SCALE26RECT8192/failed/rect4096.1269004.err @@ -1,6 +1,6 @@ + ibrun ./MultTime /work/00919/tg459476/SCALE26-RMAT/rmat26.txt /work/00919/tg459476/SCALE26-RMAT/fringe_scale26_rect8192_sparse100000 -Warning: Permanently added 'i122-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. -Warning: Permanently added 'i178-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i122-312.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. +Warning: Permanently added 'i178-409.ranger.tacc.utexas.edu' (RSA) to the list of known hosts. [i162-410:02581] *** Process received signal *** [i162-410:02581] Signal: Bus error (7) [i162-410:02581] Signal code: (2) diff --git a/compile_commands.json b/compile_commands.json new file mode 120000 index 00000000..572ddf1d --- /dev/null +++ b/compile_commands.json @@ -0,0 +1 @@ +/jet/home/tmcfarla/CombBLAS/_build/compile_commands.json \ No newline at end of file diff --git a/include/CombBLAS/CombBLAS.h b/include/CombBLAS/CombBLAS.h index 94bbb6c2..fc93cf07 100644 --- a/include/CombBLAS/CombBLAS.h +++ b/include/CombBLAS/CombBLAS.h @@ -59,9 +59,9 @@ NOTICE. This Software was developed under funding from the U.S. 
Department of E // Just in case the -fopenmp didn't define _OPENMP by itself #ifdef THREADED - #ifndef _OPENMP - #define _OPENMP - #endif + //#ifndef _OPENMP + //#define _OPENMP + //#endif #endif #ifdef _OPENMP diff --git a/include/CombBLAS/ParFriends.h b/include/CombBLAS/ParFriends.h index a79c714c..279025eb 100644 --- a/include/CombBLAS/ParFriends.h +++ b/include/CombBLAS/ParFriends.h @@ -41,8 +41,22 @@ #include "OptBuf.h" #include "mtSpGEMM.h" #include "MultiwayMerge.h" +#include #include #include +#include + +#ifdef __CUDACC__ +#include +#include "cudaSpGEMM.h" +#include "../GALATIC/include/dCSR.cuh" +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/SemiRingInterface.h" +#include "../GALATIC/include/TestSpGEMM.cuh" +#include "../GALATIC/source/device/Multiply.cuh" +#endif +//#include "cudaSpGEMM.cu" + namespace combblas { @@ -1253,7 +1267,8 @@ SpParMat Mult_AnXBn_DoubleBuff int stages, dummy; // last two parameters of ProductGrid are ignored for Synch multiplication std::shared_ptr GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy); - LIA C_m = A.spSeq->getnrow(); + + LIA C_m = A.spSeq->getnrow(); LIB C_n = B.spSeq->getncol(); UDERA * A1seq = new UDERA(); @@ -1282,8 +1297,11 @@ SpParMat Mult_AnXBn_DoubleBuff int Aself = (A.commGrid)->GetRankInProcRow(); int Bself = (B.commGrid)->GetRankInProcCol(); + double mpi_overhead = 0.0; + for(int i = 0; i < stages; ++i) { + std::vector ess; if(i == Aself) { @@ -1350,6 +1368,7 @@ SpParMat Mult_AnXBn_DoubleBuff // Start the second round for(int i = 0; i < stages; ++i) { + std::vector ess; if(i == Aself) { @@ -1443,6 +1462,588 @@ SpParMat Mult_AnXBn_DoubleBuff return SpParMat (C, GridC); // return the result object } +#ifdef __CUDACC__ +template +struct Wrap_SR : SemiRing +{ + __host__ __device__ NT3 multiply(const NT1 &a, const NT2 &b) const { return sr::multiply(a, b); } + __host__ __device__ NT3 add(const NT1 &a, const NT2 &b) const { return sr::add(a, b); } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + + + +template +void convertCSR(UDERA *ARecv, dCSR &input_GPU, int id) +{ + typedef typename UDERA::LocalIT LIA; + LIA j = 0; + unsigned int *rows; + cudaMallocHost(&rows, sizeof(unsigned int) * (ARecv->getncol() + 1)); + HANDLE_ERROR(cudaGetLastError()); + + for (LIA i = 0; i <= ARecv->getnzc(); ++i) + { + if (i == ARecv->getnzc()) + { + while (j <= ARecv->getncol()) + { + rows[j] = ARecv->getnnz(); + j++; + } + break; + } + unsigned int val = (unsigned int) ARecv->GetDCSC()->cp[i]; + while (j <= ARecv->GetDCSC()->jc[i] && j <= ARecv->getncol()) + { + rows[j] = val; + j++; + } + } + HANDLE_ERROR(cudaGetLastError()); + + //std::cout << "STARTING ALLOCING in CONV " << id << std::endl; + if(input_GPU.nnz != 0) dealloc(input_GPU); + input_GPU.rows = ARecv->getncol(); + input_GPU.cols = ARecv->getnrow(); + input_GPU.nnz = ARecv->getnnz(); + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << input_GPU.nnz << std::endl; + gpuErrchk(cudaMalloc(&input_GPU.data, sizeof(NU1) * (ARecv->getnnz()))); + gpuErrchk(cudaMalloc(&input_GPU.col_ids, sizeof(unsigned int) * (ARecv->getnnz()))); + gpuErrchk(cudaMalloc(&input_GPU.row_offsets, sizeof(unsigned int) * (ARecv->getncol() + 1))); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "STARTING COPY " << id << std::endl; + + cudaMemcpy(input_GPU.row_offsets, rows, (input_GPU.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "CPED ROW/COLS " << id << std::endl; + 
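The first half of `convertCSR` above expands the DCSC column pointers (`cp`/`jc`, defined only for the `nzc` non-empty columns) into a dense CSR-style offset array with one entry per column plus one. A simplified host-side sketch of that expansion follows; the function name and container types are illustrative only and are not part of the patch.

```cpp
// Hedged sketch: expand DCSC column pointers into dense CSR offsets.
// Empty columns inherit the starting offset of the next non-empty column;
// trailing empty columns all point at nnz.
#include <cstdint>
#include <vector>

std::vector<unsigned int> expand_dcsc_offsets(const std::vector<int64_t>& cp,  // size nzc+1
                                              const std::vector<int64_t>& jc,  // size nzc
                                              int64_t ncol, int64_t nnz)
{
    std::vector<unsigned int> offsets(ncol + 1);
    int64_t j = 0;
    for (std::size_t i = 0; i < jc.size(); ++i)
    {
        // every column up to and including jc[i] starts where chunk i starts
        while (j <= jc[i] && j <= ncol)
            offsets[j++] = static_cast<unsigned int>(cp[i]);
    }
    while (j <= ncol)
        offsets[j++] = static_cast<unsigned int>(nnz);  // trailing empty columns
    return offsets;
}
```

The device-side matrix is then filled by copying this offset array together with the unchanged `ir`/`numx` arrays, which is what the surrounding `cudaMemcpy` calls do.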
if(ARecv->getnnz() > 0) gpuErrchk(cudaMemcpy(input_GPU.data, ARecv->GetDCSC()->numx, (ARecv->getnnz()) * sizeof(NU1), cudaMemcpyHostToDevice)); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "CPED NUM " << id << std::endl; + if(ARecv->getnnz() > 0) gpuErrchk(cudaMemcpy(input_GPU.col_ids, &(ARecv->GetDCSC()->ir[0]), (ARecv->getnnz()) * sizeof(unsigned int), cudaMemcpyHostToDevice)); + gpuErrchk(cudaDeviceSynchronize()); + // std::cout << "DELETING ROWS " << id << std::endl; + + cudaFreeHost(rows); + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + + // free(rows); + +} + +// Workaround for now + + + + +struct MinPlusSRingGPU : SemiRing { + __host__ __device__ double multiply(const double& a, const double& b) const { if(a == std::numeric_limits::max() || b == std::numeric_limits::max()) { return std::numeric_limits::max();} else return a + b; } + __host__ __device__ double add(const double& a, const double& b) const { return std::min(a, b); } + __host__ __device__ static double AdditiveIdentity() { return std::numeric_limits::max(); } +}; + +typedef Arith_SR ringss; +Arith_SR sr; +double comptime = 0; +template +CSR GPULocalMultiply(dCSR& A, dCSR& B) +{ + + double t1 = MPI_Wtime(); + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 16; + const int MaxChunksGeneralizedMerge = 512; // MAX: 865 + const int MergePathOptions = 8; + HANDLE_ERROR(cudaGetLastError()); + + cudaDeviceSynchronize(); + SR semiring2; + if(A.nnz == 0 || B.nnz == 0) { + CSR C; + C.alloc(A.rows, B.rows, 0); + return C; + } + dCSR result_mat_GPU; + GPUMatrixMatrixMultiplyTraits DefaultTraits( + Threads, BlocksPerMP, NNZPerThread, InputElementsPerThreads, + RetainElementsPerThreads, MaxChunksToMerge, + MaxChunksGeneralizedMerge, MergePathOptions); + + const bool Debug_Mode = false; + // DefaultTraits.preferLoadBalancing = false; + ExecutionStats stats; + // stats.measure_all = false; + HANDLE_ERROR(cudaGetLastError()); + + + //std::cout << "ENTERED MULT" << std::endl; + ACSpGEMM::Multiply( + A, B, result_mat_GPU, + DefaultTraits, stats, Debug_Mode, sr); + //std::cout << "EXITED MULT" << std::endl; + + + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + // std::cout << "DONE" << std::endl; + CSR result_mat_CPU; + size_t it = 0; + // std::unordered_set nnzc_set; + // std::cout << result_mat_GPU.rows << std::endl; + convert(result_mat_CPU, result_mat_GPU); + //::cout << sizeof(NUO) * result_mat_GPU.nnz << std::endl; + //std::cout << sizeof(uint) * result_mat_GPU.rows << std::endl; + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + //cudaFree(result_mat_GPU.data); + //cudaFree(result_mat_GPU.col_ids); + //cudaFree(result_mat_GPU.row_offsets); + HANDLE_ERROR(cudaGetLastError()); + //result_mat_GPU.reset(); + cudaDeviceSynchronize(); + double t2 = MPI_Wtime(); + comptime += (t2 - t1); + HANDLE_ERROR(cudaGetLastError()); + return result_mat_CPU; +} + + +int GPUTradeoff = 1024 * 1024; +/** + * Parallel C = A*B routine that uses a double buffered broadcasting scheme, but + * this time with CUDA + * @pre { Input matrices, A and B, should not alias } + * Most memory efficient version available. 
Total stages: 2*sqrt(p) + * Memory requirement during first sqrt(p) stages: <= (3/2)*(nnz(A)+nnz(B))+(1/2)*nnz(C) + * Memory requirement during second sqrt(p) stages: <= nnz(A)+nnz(B)+nnz(C) + * Final memory requirement: nnz(C) if clearA and clearB are true + **/ +double checkingTime = 0; +template +SpParMat Mult_AnXBn_DoubleBuff_CUDA(SpParMat &A, + SpParMat &B, + bool clearA = false, + bool clearB = false) + +{ + HANDLE_ERROR(cudaGetLastError()); + + if (!CheckSpGEMMCompliance(A, B)) + { + return SpParMat(); + } + typedef typename UDERA::LocalIT LIA; + typedef typename UDERB::LocalIT LIB; + typedef typename UDERO::LocalIT LIC; + + double over = 0; + double t1 = MPI_Wtime(); + static_assert( + std::is_same::value, + "local index types for both input matrices should be the same"); + static_assert( + std::is_same::value, + "local index types for input and output matrices should be the same"); + + int stages, dummy; // last two parameters of ProductGrid are ignored for + // Synch multiplication + int id; + MPI_Comm_rank(MPI_COMM_WORLD, &id); + ACSpGEMM::id = id; + int devices; + HANDLE_ERROR(cudaGetLastError()); + + cudaGetDeviceCount(&devices); + int local_rank, local_size; + //MPI_Comm local_comm; + //MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, id, MPI_INFO_NULL, &local_comm); + //MPI_Comm_size(local_comm, &local_size); + //MPI_Comm_rank(local_comm, &local_rank); + cudaSetDevice(id % devices); + int devs; + + //cudaGetDeviceCount(&devs); + //cudaSetDevice(id % devs); // Prevents crashes where processes > # devs + std::shared_ptr GridC = ProductGrid( + (A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy); + LIA C_m = A.spSeq->getnrow(); + LIB C_n = B.spSeq->getncol(); + + UDERA *A1seq = new UDERA(); + UDERA *A2seq = new UDERA(); + UDERB *B1seq = new UDERA(); + UDERB *B2seq = new UDERB(); + int Aself = (A.commGrid)->GetRankInProcRow(); + int Bself = (B.commGrid)->GetRankInProcCol(); + + checkingTime += MPI_Wtime() - t1; + + (A.spSeq)->Split(*A1seq, *A2seq); + const_cast(B.spSeq)->Transpose(); + (B.spSeq)->Split(*B1seq, *B2seq); + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << Aself << " " << Bself << " starting GPU" << std::endl; + dCSR input_A_GPU; + dCSR input_B_GPU; + Wrap_SR semiring; + HANDLE_ERROR(cudaGetLastError()); + + // std::cout << Aself << " " << Bself << " ending cpus" << std::endl; + + gpuErrchk(cudaDeviceSynchronize()); + + // Transpose back for the column-by-column algorithm + const_cast(B1seq)->Transpose(); + const_cast(B2seq)->Transpose(); + LIA **ARecvSizes = SpHelper::allocate2D(UDERA::esscount, stages); + LIB **BRecvSizes = SpHelper::allocate2D(UDERB::esscount, stages); + + SpParHelper::GetSetSizes(*A1seq, ARecvSizes, (A.commGrid)->GetRowWorld()); + SpParHelper::GetSetSizes(*B1seq, BRecvSizes, (B.commGrid)->GetColWorld()); + + // Remotely fetched matrices are stored as pointers + UDERA *ARecv; + UDERB *BRecv; + std::vector *> tomerge; + HANDLE_ERROR(cudaGetLastError()); + + + HANDLE_ERROR(cudaGetLastError()); + + over += MPI_Wtime() - t1; + + double mpi_overhead = 0.0; + + for (int i = 0; i < stages; ++i) + { + HANDLE_ERROR(cudaGetLastError()); + double t2 = MPI_Wtime(); + dCSR input_A_recv_GPU; + dCSR input_B_recv_GPU; + std::vector ess; + if (i == Aself) + { + convertCSR(A1seq, input_A_recv_GPU, id); + + } + else + { + + ARecv = new UDERA(); // first, create the object + } + ess.resize(UDERA::esscount); + for (int j = 0; j < UDERA::esscount; ++j) + { + ess[j] = ARecvSizes[j][i]; // essentials of the ith + // matrix in this row + } + 
//std::cout << "STARTING BCAST " << id << std::endl; + SpParHelper::BCastMatrixCUDA(GridC->GetRowWorld(), + input_A_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + //std::cout << "ENDING BCAST " << id << std::endl; + ess.clear(); + if (i == Bself) + { + convertCSR(B1seq, input_B_recv_GPU, id); // shallow-copy + } + else + { + + BRecv = new UDERB(); + } + ess.resize(UDERB::esscount); + for (int j = 0; j < UDERB::esscount; ++j) + { + ess[j] = BRecvSizes[j][i]; + } + SpParHelper::BCastMatrixCUDA(GridC->GetColWorld(), + input_B_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + HANDLE_ERROR(cudaGetLastError()); + //std::cout << "first bcast done for " << id << std::endl; + + //if(input_B_recv_GPU.nnz == 0 || input_A_recv_GPU.nnz == 0) { + // std::cout << "ZEROOO " << id << std::endl; + // continue; + //} + // before activating this remove transposing B1seq + /* + SpTuples * C_cont = MultiplyReturnTuples + (*ARecv, *BRecv, // parameters + themselves false, true, // transpose information (B is + transposed) i != Aself, // 'delete A' condition i != + Bself); // 'delete B' condition + + */ + + // load results onto CPU. + + // double start = MPI_Wtime(); + + // std::cout << Aself << " " << Bself << " ending alloc" << + // std::endl; double start = MPI_Wtime(); double t1 = + // MPI_Wtime(); + + // std::cout << input_A_recv_GPU.rows << std::endl; + mpi_overhead += MPI_Wtime() - t2; + //std::cout << "mult on " << id << std::endl; + //MPI_Barrier(MPI_COMM_WORLD); + //MPI_Barrier(MPI_COMM_WORLD); + CSR result_mat_CPU = GPULocalMultiply(input_B_recv_GPU, input_A_recv_GPU); + + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + //std::cout << "mult off" << id << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + //over += MPI_Wtime() - t1; + //std::cout << "TUPLING " << id << std::endl; + // printf("O = %i\n", C_cont->getnnz()); + // mpi_overhead += MPI_Wtime() - start; + size_t it = 0; + std::tuple *tuplesC = + static_cast *>(::operator new( + sizeof(std::tuple[result_mat_CPU.nnz]))); + for (LIC i = 0; i < result_mat_CPU.rows; ++i) + { + + for (LIC j = result_mat_CPU.row_offsets[i]; + j < result_mat_CPU.row_offsets[i + 1]; ++j) + { + // nzc_set.insert(result_mat_CPU.col_ids[j]); + // std::cout << "IT " << it << " EXCEEDED " << + // result_mat_CPU.nnz < *C_cont = new SpTuples( + result_mat_CPU.nnz, C_m, + C_n, tuplesC, false, true); + //(*C_cont).PrintInfo(); + if (i != Aself) + delete ARecv; + //dealloc(input_A_recv_GPU); + + if (i != Bself) + delete BRecv; + //dealloc(input_B_recv_GPU); + + if (!C_cont->isZero()) + tomerge.push_back(C_cont); + else + delete C_cont; + } + HANDLE_ERROR(cudaGetLastError()); + + if (clearA) + delete A1seq; + if (clearB) + delete B1seq; + + // Set the new dimensions + t1 = MPI_Wtime(); + //dealloc(input_A_GPU); + //dealloc(input_B_GPU); + cudaDeviceSynchronize(); + dCSR input_A2_GPU; + dCSR input_B2_GPU; + HANDLE_ERROR(cudaGetLastError()); + + + HANDLE_ERROR(cudaGetLastError()); + + SpParHelper::GetSetSizes(*A2seq, ARecvSizes, (A.commGrid)->GetRowWorld()); + SpParHelper::GetSetSizes(*B2seq, BRecvSizes, (B.commGrid)->GetColWorld()); + over += MPI_Wtime() - t1; + + + //std::cout << "S3 " << id << std::endl; + for (int i = 0; i < stages; ++i) + { + double t2 = MPI_Wtime(); + dCSR input_A_recv_GPU; + dCSR input_B_recv_GPU; + // std::cout << Aself << " " << Bself << " starting stage " << i + // << std::endl; + std::vector ess; + if (i == Aself) + { + convertCSR(A2seq, input_A_recv_GPU, id); + } else + { + + ARecv = new UDERA(); // first, create 
the object + } + ess.resize(UDERA::esscount); + for (int j = 0; j < UDERA::esscount; ++j) + { + ess[j] = ARecvSizes[j][i]; // essentials of the ith + // matrix in this row + } + //std::cout << "STARTING BCAST " << id << std::endl; + SpParHelper::BCastMatrixCUDA(GridC->GetRowWorld(), + input_A_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + //std::cout << "ENDING BCAST " << id << std::endl; + ess.clear(); + if (i == Bself) + { + convertCSR(B2seq, input_B_recv_GPU, id); } + else + { + + BRecv = new UDERB(); + } + ess.resize(UDERB::esscount); + for (int j = 0; j < UDERB::esscount; ++j) + { + ess[j] = BRecvSizes[j][i]; + } + SpParHelper::BCastMatrixCUDA(GridC->GetColWorld(), + input_B_recv_GPU, ess, i, GPUTradeoff); // then, receive its elements + + // before activating this remove transposing B1seq + /* + SpTuples * C_cont = MultiplyReturnTuples + (*ARecv, *BRecv, // parameters + themselves false, true, // transpose information (B is + transposed) i != Aself, // 'delete A' condition i != + Bself); // 'delete B' condition + + */ + /* ARecv->Transpose(); + BRecv->Transpose(); + SpTuples * C_cont = LocalHybridSpGEMM + (*ARecv, *BRecv, // parameters themselves + i != Aself, // 'delete A' condition + i != Bself); // 'delete B' condition*/ + // const_cast< UDERB* >(B.spSeq)->Transpose(); + HANDLE_ERROR(cudaGetLastError()); + + mpi_overhead += MPI_Wtime() - t2; + CSR result_mat_CPU = GPULocalMultiply(input_B_recv_GPU, input_A_recv_GPU); + gpuErrchk(cudaDeviceSynchronize()); + HANDLE_ERROR(cudaGetLastError()); + + //over += MPI_Wtime() - t1; + //std::cout << over << std::endl; + // std::cout << "ENDING MULT" << std::endl; + // mpi_overhead += MPI_Wtime() - start; + // double t2 = MPI_Wtime(); + // printf("Time for actual mult = %.6lf \n", t2 - t1); + size_t it = 0; + // std::unordered_set nnzc_set; + // std::cout << result_mat_GPU.nnz << std::endl; + // std::cout << Aself << " " << Bself << " ending GPU " << i << + // std::endl; + // printf("OC = %i\n", result_mat_CPU.nnz); + + std::tuple *tuplesC = + static_cast *>(::operator new( + sizeof(std::tuple[result_mat_CPU.nnz]))); + for (LIC i = 0; i < result_mat_CPU.rows; ++i) + { + for (LIC j = result_mat_CPU.row_offsets[i]; + j < result_mat_CPU.row_offsets[i + 1]; ++j) + { + // nzc_set.insert(result_mat_CPU.col_ids[j]); + // std::cout << "IT " << it << " EXCEEDED " << + // result_mat_CPU.nnz < *C_cont = new SpTuples( + result_mat_CPU.nnz, C_m, + C_n, tuplesC, false, true); + //(*C_cont).PrintInfo(); + if (i != Aself) + delete ARecv; + //dealloc(input_A_recv_GPU); + + + if (i != Bself) + delete BRecv; + //dealloc(input_B_recv_GPU); + + if (!C_cont->isZero()) + tomerge.push_back(C_cont); + else + delete C_cont; + } + t1 = MPI_Wtime(); + //dealloc(input_A2_GPU); + //dealloc(input_B2_GPU); + SpHelper::deallocate2D(ARecvSizes, UDERA::esscount); + SpHelper::deallocate2D(BRecvSizes, UDERB::esscount); + // A2seq->Transpose(); + // B2seq->Transpose(); + if (clearA) + { + delete A2seq; + delete A.spSeq; + A.spSeq = NULL; + } + else + { + // A1seq->Transpose(); + // A2seq->Transpose(); + (A.spSeq)->Merge(*A1seq, *A2seq); + delete A1seq; + delete A2seq; + } + if (clearB) + { + delete B2seq; + delete B.spSeq; + B.spSeq = NULL; + } + else + { + B1seq->Transpose(); + B2seq->Transpose(); + (B.spSeq)->Merge(*B1seq, *B2seq); + delete B1seq; + delete B2seq; + const_cast(B.spSeq) + ->Transpose(); // transpose back to original + } + //checkingTime += MPI_Wtime() - t1; + // printf("%.6lf\n", mpi_overhead); + UDERO *C = new UDERO(MergeAll(tomerge, C_m, C_n, 
true), false); + // printf("Full output has rows = %i, cols = %i, nnz = %i\n", C->getnrow(), + // C->getncol(), C->getnnz()); + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + + over += MPI_Wtime() - t1; + //std::cout << over << "\n"; + return SpParMat( + C, GridC); // return the result object // return the result object + HANDLE_ERROR(cudaGetLastError()); + +} + +#endif + /** * Parallel A = B*C routine that uses only MPI-1 features * Relies on simple blocking broadcast diff --git a/include/CombBLAS/SequenceHeaps/util.h b/include/CombBLAS/SequenceHeaps/util.h index 1d4b41aa..e9c146c7 100644 --- a/include/CombBLAS/SequenceHeaps/util.h +++ b/include/CombBLAS/SequenceHeaps/util.h @@ -68,11 +68,11 @@ ////////////// min, max etc. ////////////////////////////////////// #ifndef Max -#define Max(x,y) ((x)>=(y)?(x):(y)) +//#define Max(x,y) ((x)>=(y)?(x):(y)) #endif #ifndef Min -#define Min(x,y) ((x)<=(y)?(x):(y)) +//#define Min(x,y) ((x)<=(y)?(x):(y)) #endif #ifndef Abs diff --git a/include/CombBLAS/SpDefs.h b/include/CombBLAS/SpDefs.h index 23c3941f..050d9666 100644 --- a/include/CombBLAS/SpDefs.h +++ b/include/CombBLAS/SpDefs.h @@ -118,8 +118,8 @@ Row // force 8-bytes alignment in heap allocated memory -#ifndef ALIGN -#define ALIGN 8 +#ifndef ALIGNX +#define ALIGNX 8 #endif #ifndef THRESHOLD diff --git a/include/CombBLAS/SpImpl.h b/include/CombBLAS/SpImpl.h index bd0bda94..0c1fe909 100644 --- a/include/CombBLAS/SpImpl.h +++ b/include/CombBLAS/SpImpl.h @@ -201,4 +201,4 @@ struct SpImpl // specialization #include "SpImpl.cpp" -#endif \ No newline at end of file +#endif diff --git a/include/CombBLAS/SpParHelper.cpp b/include/CombBLAS/SpParHelper.cpp index f2198294..61ca0cbd 100644 --- a/include/CombBLAS/SpParHelper.cpp +++ b/include/CombBLAS/SpParHelper.cpp @@ -600,6 +600,101 @@ void SpParHelper::BCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, cons } } +/** + * @param[in] Matrix {For the root processor, the local object to be sent to all others. 
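For reference, the new `Mult_AnXBn_DoubleBuff_CUDA` is meant to be called the same way as the existing `Mult_AnXBn_DoubleBuff`. The sketch below is an assumption rather than code from this patch: the explicit template arguments (semiring, output value type, output local storage type), the `PlusTimesSRing` choice, and the input-reading boilerplate mirror the usual CombBLAS release-test pattern.

```cpp
// Hypothetical driver for the GPU double-buffered SUMMA path; file names and
// the template-argument order are assumptions based on the existing
// Mult_AnXBn_DoubleBuff call convention, not taken from this patch.
#include <memory>
#include <mpi.h>
#include "CombBLAS/CombBLAS.h"
using namespace combblas;

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    {
        typedef SpDCCols<int64_t, double> DER;
        typedef PlusTimesSRing<double, double> PTDD;

        std::shared_ptr<CommGrid> fullWorld;
        fullWorld.reset(new CommGrid(MPI_COMM_WORLD, 0, 0));

        SpParMat<int64_t, double, DER> A(fullWorld), B(fullWorld);
        A.ParallelReadMM(argv[1], true, maximum<double>());  // 1-based Matrix Market input
        B.ParallelReadMM(argv[2], true, maximum<double>());

        // Semantics are intended to match Mult_AnXBn_DoubleBuff
        SpParMat<int64_t, double, DER> C =
            Mult_AnXBn_DoubleBuff_CUDA<PTDD, double, DER>(A, B);
        C.PrintInfo();
    }
    MPI_Finalize();
    return 0;
}
```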
+ * For all others, it is a (yet) empty object to be filled by the received data} + * @param[in] essentials {irrelevant for the root} + **/ + +#ifdef __CUDACC__ + +double commtime = 0; +int comms = 0; +int datahits = 0; + +int rowshits = 0; + +int colhits = 0; + + +template +void SpParHelper::BCastMatrixCUDA(MPI_Comm & comm1d, dCSR & Matrix, const std::vector & essentials, int root, int GPUTradeoff) +{ + comms += 1; + double t1 = MPI_Wtime(); + cudaDeviceSynchronize(); + int myrank; + MPI_Comm_rank(comm1d, &myrank); + if(myrank != root) + { + Matrix.alloc(essentials[2],essentials[1],essentials[0],true); + } + + //if(sizeof(uint)*(Matrix.nnz) <= 32000) std::cout << "UNDER COLS" << std::endl; + //if(sizeof(NT)*(Matrix.nnz) <= 32000) std::cout << "UNDER DATA" << std::endl; + //std::cout << myrank << " " << Matrix.rows << " " << Matrix.cols << " " << Matrix.nnz << std::endl; + cudaDeviceSynchronize(); + //std::cout << myrank << " BCASTING FIRST FROM " << root << std::endl; + //if(!essentials[0]) return; + //size_t free; + //size_t total; + //cudaMemGetInfo(&free, &total); + //std::cout << myrank << " has " << free << " of " << total << std::endl; + //int GPUTradeoff = 1024 * 1024; + //std::cout << GPUTradeoff << std::endl; + if(sizeof(uint)*(Matrix.rows + 1) >= GPUTradeoff) { + rowshits += 1; + MPI_Bcast(Matrix.row_offsets, Matrix.rows + 1, MPIType(), root, comm1d); + } else { + uint* temp = (uint*) malloc(sizeof(uint)*(Matrix.rows + 1)); + if(myrank == root) cudaMemcpy(temp, Matrix.row_offsets, (Matrix.rows + 1)*sizeof(uint), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.rows + 1, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.row_offsets, temp, (Matrix.rows + 1)*sizeof(uint), cudaMemcpyHostToDevice); + free(temp); + } + + cudaDeviceSynchronize(); + //std::cout << myrank << " BCASTING SECOND" << std::endl; + if(sizeof(uint)*(Matrix.nnz) >= GPUTradeoff) { + colhits += 1; + MPI_Bcast(Matrix.col_ids, Matrix.nnz, MPIType(), root, comm1d); + } else { + //std::cout << "ACTIVATED WOOHOO" << std::endl; + uint* temp = (uint*) malloc(sizeof(uint)*Matrix.nnz); + if(myrank == root) cudaMemcpy(temp, Matrix.col_ids, Matrix.nnz*sizeof(uint), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.nnz, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.col_ids, temp, Matrix.nnz*sizeof(uint), cudaMemcpyHostToDevice); + free(temp); + //MPI_Bcast(Matrix.col_ids, Matrix.nnz, MPIType(), root, comm1d); + } + + cudaDeviceSynchronize(); + //std::cout << "BCASTING 2 " << myrank << std::endl; + if(sizeof(NT)*(Matrix.nnz) >= GPUTradeoff) { + datahits += 1; + MPI_Bcast(Matrix.data, Matrix.nnz, MPIType(), root, comm1d); + } else { + //std::cout << "WE ARE ON" << std::endl; + NT* temp = (NT*) malloc(sizeof(NT)*Matrix.nnz); + if(myrank == root) cudaMemcpy(temp, Matrix.data, Matrix.nnz*sizeof(NT), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + MPI_Bcast(temp, Matrix.nnz, MPIType(), root, comm1d); + cudaDeviceSynchronize(); + if(myrank != root) cudaMemcpy(Matrix.data, temp, Matrix.nnz*sizeof(NT), cudaMemcpyHostToDevice); + free(temp); + } + + cudaDeviceSynchronize(); + //std::cout << "BCAST DONE " << myrank << std::endl; + commtime += MPI_Wtime() - t1; +} + +#endif + /** * @param[in] Matrix {For the root processor, the local object to be sent to all others. 
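`BCastMatrixCUDA` above broadcasts each of the three CSR arrays either directly from device memory (which relies on a CUDA-aware MPI) or, when the array is smaller than the `GPUTradeoff` byte threshold, through a temporary host buffer. A minimal sketch of that per-array decision, using an illustrative function name and a plain `double` payload rather than the templated types used in the patch:

```cpp
// Hedged sketch of the size-based broadcast path selection.  Assumes the MPI
// library accepts device pointers (CUDA-aware MPI); small arrays are staged
// through the host instead.
#include <cstdlib>
#include <cuda_runtime.h>
#include <mpi.h>

void bcast_device_array(double* d_buf, int count, int root, MPI_Comm comm,
                        std::size_t gpu_direct_threshold = 1024 * 1024)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    if (count * sizeof(double) >= gpu_direct_threshold)
    {
        // Large array: hand the device pointer straight to MPI.
        MPI_Bcast(d_buf, count, MPI_DOUBLE, root, comm);
    }
    else
    {
        // Small array: stage through a host copy to avoid GPU-direct overhead.
        double* h_buf = static_cast<double*>(std::malloc(count * sizeof(double)));
        if (rank == root)
            cudaMemcpy(h_buf, d_buf, count * sizeof(double), cudaMemcpyDeviceToHost);
        MPI_Bcast(h_buf, count, MPI_DOUBLE, root, comm);
        if (rank != root)
            cudaMemcpy(d_buf, h_buf, count * sizeof(double), cudaMemcpyHostToDevice);
        std::free(h_buf);
    }
}
```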
* For all others, it is a (yet) empty object to be filled by the received data} diff --git a/include/CombBLAS/SpParHelper.h b/include/CombBLAS/SpParHelper.h index 840a09f4..e43cd30b 100644 --- a/include/CombBLAS/SpParHelper.h +++ b/include/CombBLAS/SpParHelper.h @@ -42,6 +42,7 @@ #include "MPIType.h" #include "SpDefs.h" #include "psort/psort.h" +#include "../GALATIC/include/dCSR.cuh" namespace combblas { @@ -80,6 +81,11 @@ class SpParHelper template static void BCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, const std::vector & essentials, int root); +#ifdef __CUDACC__ + template + static void BCastMatrixCUDA(MPI_Comm & comm1d, dCSR & Matrix, const std::vector & essentials, int root, int GPUTradeoff=1024*1024); +#endif + template static void IBCastMatrix(MPI_Comm & comm1d, SpMat & Matrix, const std::vector & essentials, int root, std::vector & indarrayReq , std::vector & numarrayReq); diff --git a/include/CombBLAS/SpParMat.cpp b/include/CombBLAS/SpParMat.cpp index 70b213a4..bd418921 100644 --- a/include/CombBLAS/SpParMat.cpp +++ b/include/CombBLAS/SpParMat.cpp @@ -92,7 +92,7 @@ SpParMat< IT,NT,DER >::SpParMat () assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) ); spSeq = new DER(); commGrid.reset(new CommGrid(MPI_COMM_WORLD, 0, 0)); -} +} /** * If there is a single file read by the master process only, use this and then call ReadDistribute() diff --git a/include/CombBLAS/SpParMat.h b/include/CombBLAS/SpParMat.h index 1d0880b8..62cf179d 100644 --- a/include/CombBLAS/SpParMat.h +++ b/include/CombBLAS/SpParMat.h @@ -323,6 +323,10 @@ class SpParMat template friend SpParMat Mult_AnXBn_DoubleBuff (SpParMat & A, SpParMat & B, bool clearA, bool clearB); + + template + friend SpParMat + Mult_AnXBn_DoubleBuff_CUDA (SpParMat & A, SpParMat & B, bool clearA, bool clearB); template friend SpParMat diff --git a/include/CombBLAS/cudaSpGEMM.cu b/include/CombBLAS/cudaSpGEMM.cu new file mode 100644 index 00000000..267651da --- /dev/null +++ b/include/CombBLAS/cudaSpGEMM.cu @@ -0,0 +1,147 @@ + + +#include "cudaSpGEMM.h" +#include +#include +#include +#include +#include +#include +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/dCSR.cuh" + +//#include "../GALATIC/source/device/Multiply.cuh" + +template +__global__ void transformColumn_d(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC, IT* curptrC, IT B_nzc) { + for(size_t i = blockIdx.x; i < B_nzc; i += gridDim.x) { + size_t nnzcolB = B_CP[i+1] - B_CP[i]; + //if(j == 0) printf("BlockDim = %i, GridDim = %i", blockDim.x, gridDim.x); + for(size_t j = threadIdx.x; j < A_nzc; j += blockDim.x) { + bool made = false; + size_t r = A_Tran_CP[j]; + uint ptr = curptrC[i]; + for (size_t k = 0; k < nnzcolB; ++k) { + + while (r < A_Tran_CP[j + 1] && B_IR[B_CP[i]+k] > A_Tran_IR[r]) { + r++; + } + if (r >= A_Tran_CP[j + 1]) { + break; + } + if (B_IR[B_CP[i]+k] == A_Tran_IR[r]) { + NTO mrhs = A_Tran_numx[r] * B_numx[B_CP[i]+k]; + if(true) { + if (made) { + std::get<2>(tuplesC[ptr]) = std::get<2>(tuplesC[ptr]) + mrhs; + } else { + made = true; + ptr = atomicAdd((unsigned long long*) &curptrC[i],(unsigned long long) 1); + //if (colptr_size_d[i] != ptr - curptrC[i]) printf("Potential conflict\n"); + //__syncthreads(); + //printf("Adding at ptr = %i\n", (int) ptr); + // colptr_size_d[i]++; + std::get<0>(tuplesC[ptr]) = A_Tran_JC[j]; + //if (A_Tran_JC[j] < 0 || B_JC[i] < 0) { + // printf("Somehow got a <0, %i, %i", (int) A_Tran_JC[j], (int) B_JC[i]); + 
//} + std::get<1>(tuplesC[ptr])= B_JC[i]; + std::get<2>(tuplesC[ptr]) = mrhs; + } + } + } + } + } + } +} +template < typename NTO, typename IT, typename NT1, typename NT2> +void transformColumn(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC_d, IT* curptrC, IT B_nzc) { + int blks = std::min(65535,(int) B_nzc); + transformColumn_d<<>>(A_nzc, A_Tran_CP, + A_Tran_IR, + A_Tran_JC, + A_Tran_numx, + B_CP, +B_IR, + B_JC, + B_numx, + tuplesC_d, curptrC, B_nzc); +} + +template void transformColumn< double, int64_t, double, double>( + int64_t A_nzc, int64_t* A_Tran_CP, + int64_t* A_Tran_IR, + int64_t* A_Tran_JC, + double* A_Tran_numx, + int64_t* B_CP, + int64_t* B_IR, + int64_t* B_JC, + double* B_numx, + std::tuple * tuplesC_d, int64_t* curptrC, int64_t B_nzc); + +template +__host__ CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, IT * aux = nullptr) { + /* dCSR input_A_GPU; +dCSR input_B_GPU; + +dCSR result_mat_GPU; +convert(input_A_GPU, input_A_CPU); +convert(input_B_GPU, input_B_CPU); + +// load data into semiring struct. For this one, we don't need to do anything, +// but you still need to pass it in for generality. The cost is trivial. + + +// Setup execution options, we'll skip the details for now. + +const int Threads = 256; +const int BlocksPerMP = 1; +const int NNZPerThread = 2; +const int InputElementsPerThreads = 2; +const int RetainElementsPerThreads = 1; +const int MaxChunksToMerge = 16; +const int MaxChunksGeneralizedMerge = 256; // MAX: 865 +const int MergePathOptions = 8; + + +GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); + +const bool Debug_Mode = true; +DefaultTraits.preferLoadBalancing = true; +ExecutionStats stats; +stats.measure_all = false; + +// Actually perform the matrix multiplicaiton +//ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + +CSR result_mat_CPU; +// load results onto CPU. 
+convert(result_mat_CPU, result_mat_GPU); +return result_mat_CPU;*/ + } + +template CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, int64_t * aux = nullptr); diff --git a/include/CombBLAS/cudaSpGEMM.h b/include/CombBLAS/cudaSpGEMM.h new file mode 100644 index 00000000..af2fa164 --- /dev/null +++ b/include/CombBLAS/cudaSpGEMM.h @@ -0,0 +1,31 @@ +#ifndef _cudaSpGEMM_h +#define _cudaSpGEMM_h + +#include "../GALATIC/include/CSR.h" +#include "../GALATIC/include/CSR.cuh" +#include "../GALATIC/include/SemiRingInterface.h" +#include + +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) const { return a * b; } + __host__ __device__ double add(const double& a, const double& b) const { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + template < typename NTO, typename IT, typename NT1, typename NT2> +void transformColumn(IT A_nzc, IT* A_Tran_CP, + IT* A_Tran_IR, + IT* A_Tran_JC, + NT1* A_Tran_numx, + IT* B_CP, + IT* B_IR, + IT* B_JC, + NT2* B_numx, + std::tuple * tuplesC_d, IT* curptrC, IT B_nzc); + +template +CSR LocalGalaticSPGEMM +(CSR input_A_CPU, +CSR input_B_CPU, + bool clearA, bool clearB, Arith_SR semiring, IT * aux = nullptr); +#endif diff --git a/include/CombBLAS/mtSpGEMM.h b/include/CombBLAS/mtSpGEMM.h index 356f3b16..a708b138 100644 --- a/include/CombBLAS/mtSpGEMM.h +++ b/include/CombBLAS/mtSpGEMM.h @@ -2,7 +2,33 @@ #define _mtSpGEMM_h #include "CombBLAS.h" +#include +#ifdef GPU_ENABLED +#include +#include +#include "cudaSpGEMM.h" +#endif + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#define CHECK_CUSPARSE(func) \ +{ \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ + __LINE__, cusparseGetErrorString(status), status); \ + return EXIT_FAILURE; \ + } \ +} namespace combblas { /* Multithreaded prefix sum @@ -459,6 +485,208 @@ SpTuples * LocalHybridSpGEMM return spTuplesC; } +#ifdef GPU_ENABLED + + + + template + SpTuples* LocalHybridSpGEMM_CUDA +(const SpDCCols & A, + const SpDCCols & B, + bool clearA, bool clearB, IT * aux = nullptr) +{ + + + IT mdim = A.getnrow(); + IT ndim = B.getncol(); + IT nnzA = A.getnnz(); + if(A.isZero() || B.isZero()) + { + return new SpTuples(0, mdim, ndim); + } + + + Dcsc* Adcsc = A.GetDCSC(); + Dcsc* Bdcsc = B.GetDCSC(); + IT nA = A.getncol(); + float cf = static_cast(nA+1) / static_cast(Adcsc->nzc); + IT csize = static_cast(ceil(cf)); // chunk size + bool deleteAux = false; + if(aux==nullptr) + { + deleteAux = true; + Adcsc->ConstructAux(nA, aux); + } + + int numThreads = 1; +#ifdef THREADED +#pragma omp parallel + { + numThreads = omp_get_num_threads(); + } +#endif + + IT* flopC = estimateFLOP(A, B, aux); + + + IT* colnnzC = estimateNNZ_Hash(A, B, flopC, aux); + IT* flopptr = prefixsum(flopC, Bdcsc->nzc, numThreads); + IT flop = flopptr[Bdcsc->nzc]; + IT* colptrC = prefixsum(colnnzC, Bdcsc->nzc, numThreads); + delete [] colnnzC; + delete [] flopC; + IT nnzc = colptrC[Bdcsc->nzc]; + + + std::tuple * tuplesC = static_cast *> (::operator new (sizeof(std::tuple[nnzc]))); + + std::vector>> colindsVec(numThreads); + + std::vector>> globalHashVecAll(numThreads); + 
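The `transformColumn_d` kernel added in `cudaSpGEMM.cu` computes C with an inner-product formulation: for every nonzero column of B and every nonzero column of A^T (that is, nonzero row of A) it walks the two sorted row-index lists with a two-pointer merge and accumulates the products, matching the commented-out CPU loop kept further down in `LocalHybridSpGEMM_CUDA`. A single-threaded host reference of the same idea, with illustrative names and types that are not part of the patch:

```cpp
// Hedged CPU reference for the per-column merge join done on the GPU above.
#include <cstdint>
#include <tuple>
#include <vector>

std::vector<std::tuple<int64_t, int64_t, double>> inner_product_spgemm(
    const std::vector<int64_t>& At_cp, const std::vector<int64_t>& At_ir,
    const std::vector<int64_t>& At_jc, const std::vector<double>& At_val,
    const std::vector<int64_t>& B_cp,  const std::vector<int64_t>& B_ir,
    const std::vector<int64_t>& B_jc,  const std::vector<double>& B_val)
{
    std::vector<std::tuple<int64_t, int64_t, double>> triples;
    for (std::size_t i = 0; i + 1 < B_cp.size(); ++i)        // nonzero columns of B
    {
        for (std::size_t j = 0; j + 1 < At_cp.size(); ++j)   // nonzero columns of A^T
        {
            int64_t r = At_cp[j];
            bool made = false;
            double acc = 0.0;
            for (int64_t k = B_cp[i]; k < B_cp[i + 1]; ++k)  // walk B's column
            {
                while (r < At_cp[j + 1] && At_ir[r] < B_ir[k]) ++r;   // advance A^T pointer
                if (r >= At_cp[j + 1]) break;
                if (At_ir[r] == B_ir[k]) { acc += At_val[r] * B_val[k]; made = true; }
            }
            if (made) triples.emplace_back(At_jc[j], B_jc[i], acc);   // (row of A, col of B, value)
        }
    }
    return triples;
}
```

The GPU version differs mainly in that columns of B are assigned to blocks, columns of A^T to threads, and output slots are claimed with `atomicAdd` on the per-column write pointer.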
std::vector>> globalHeapVecAll(numThreads); + + + + SpDCCols A_Tran = A.TransposeConst(); + SpDCCols B_Tran = B.TransposeConst(); + + Dcsc* Adcsc_Tran = A_Tran.GetDCSC(); + IT* A_Tran_CP; + IT* A_Tran_IR; + IT* A_Tran_JC; + NT1* A_Tran_numx; + IT* B_CP; + IT* B_IR; + IT* B_JC; + NT2* B_numx; + std::tuple * tuplesC_d; + IT * tuplesC_d_o; + IT * tuplesC_d_t; + NTO * tuplesC_d_th; + uint * colptr_size_d; + uint* curptr_d; + IT * colptrC_d; + cudaMalloc((void**) &curptr_d, sizeof(uint)); + cudaMalloc((void**) &tuplesC_d_o, (sizeof(IT[nnzc]))); + cudaMalloc((void**) &tuplesC_d_t, (sizeof(IT[nnzc]))); + cudaMalloc((void**) &tuplesC_d_th, (sizeof(NTO[nnzc]))); + cudaMalloc((void**) &tuplesC_d, (sizeof(std::tuple[nnzc]))); + cudaMalloc((void**) &colptr_size_d, (sizeof(uint[Bdcsc->nzc]))); + cudaMemset(colptr_size_d, 0, sizeof(uint[Bdcsc->nzc])); + cudaMalloc((void**) &A_Tran_CP, sizeof(IT[Adcsc_Tran->nzc + 1])); + cudaMalloc((void**) &A_Tran_IR, sizeof(IT[Adcsc_Tran->nz])); + cudaMalloc((void**) &A_Tran_JC, sizeof(IT[Adcsc_Tran->nzc])); + cudaMalloc((void**) &A_Tran_numx, sizeof(NT1[Adcsc_Tran->nz])); + cudaMalloc((void**) &B_CP, sizeof(IT[Bdcsc->nzc + 1])); + cudaMalloc((void**) &B_IR, sizeof(IT[Bdcsc->nz])); + cudaMalloc((void**) &B_JC, sizeof(IT[Bdcsc->nzc])); + cudaMalloc((void**) &B_numx, sizeof(NT2[Bdcsc->nz])); + cudaMalloc((void**) &colptrC_d, sizeof(IT[Bdcsc->nzc])); + cudaMemcpy(colptrC_d, colptrC, sizeof(IT[Bdcsc->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_CP, Adcsc_Tran->cp, sizeof(IT[Adcsc_Tran->nzc + 1]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_IR, Adcsc_Tran->ir, sizeof(IT[Adcsc_Tran->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_JC, Adcsc_Tran->jc, sizeof(IT[Adcsc_Tran->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(A_Tran_numx, Adcsc_Tran->numx, sizeof(NT1[Adcsc_Tran->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(B_CP, Bdcsc->cp, sizeof(IT[Bdcsc->nzc + 1]), cudaMemcpyHostToDevice); + cudaMemcpy(B_IR, Bdcsc->ir, sizeof(IT[Bdcsc->nz]), cudaMemcpyHostToDevice); + cudaMemcpy(B_JC, Bdcsc->jc, sizeof(IT[Bdcsc->nzc]), cudaMemcpyHostToDevice); + cudaMemcpy(B_numx, Bdcsc->numx, sizeof(NT1[Bdcsc->nz]), cudaMemcpyHostToDevice); +/*#ifdef THREADED +#pragma omp parallel for +#endif + for(size_t i=0; i < Bdcsc->nzc; ++i) + { + size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i]; //nnz in the current column of B + int myThread = 0; + +#ifdef THREADED + myThread = omp_get_thread_num(); +#endif + uint* curptr = new uint; + *curptr = colptrC[i]; + cudaMemcpy(curptr_d, curptr, sizeof(uint), cudaMemcpyHostToDevice); + delete curptr; + //uint curptr = colptrC[i]; + /*for(size_t j = 0; j < Adcsc_Tran->nzc; ++j) { + bool made = false; + size_t r = Adcsc_Tran->cp[j]; + for (size_t k = 0; k < nnzcolB; ++k) { + while (r < Adcsc_Tran->cp[j + 1] && Bdcsc->ir[Bdcsc->cp[i]+k] > Adcsc_Tran->ir[r]) { + r++; + } + if (r >= Adcsc_Tran->cp[j + 1]) { + break; + } + if (Bdcsc->ir[Bdcsc->cp[i]+k] == Adcsc_Tran->ir[r]) { + NTO mrhs = Adcsc_Tran->numx[r] * Bdcsc->numx[Bdcsc->cp[i]+k]; + if(true) { + if (made) { + std::get<2>(tuplesC[curptr - 1]) = std::get<2>(tuplesC[curptr - 1]) + mrhs; + } else { + made = true; + //tuplesC[curptr++] = std::make_tuple(Adcsc_Tran->jc[j], Bdcsc->jc[i], mrhs); + std::get<0>(tuplesC[curptr]) = Adcsc_Tran->jc[j]; + std::get<1>(tuplesC[curptr]) = Bdcsc->jc[i]; + std::get<2>(tuplesC[curptr++]) = mrhs; + } + } + } + } + } + //cudaDeviceSynchronize(); + }*/ + + transformColumn(Adcsc_Tran->nzc, A_Tran_CP, A_Tran_IR, A_Tran_JC, A_Tran_numx, B_CP, B_IR, B_JC, B_numx, tuplesC_d, 
colptrC_d, Bdcsc->nzc); + + + if(clearA) + delete const_cast *>(&A); + if(clearB) + delete const_cast *>(&B); + + + + if(deleteAux) + delete [] aux; + //std::cout << "Made it to receive" << std::endl; + IT * tuplesC_o = static_cast (::operator new (sizeof(IT[nnzc]))); + IT * tuplesC_t = static_cast (::operator new (sizeof(IT[nnzc]))); + NTO * tuplesC_th = static_cast (::operator new (sizeof(NTO[nnzc]))); + + uint * colptr_size = static_cast (::operator new (sizeof(uint[Bdcsc->nzc]))); + cudaMemcpy(tuplesC, tuplesC_d, sizeof(std::tuple[nnzc]), cudaMemcpyDeviceToHost); + gpuErrchk( cudaPeekAtLastError() ); +gpuErrchk( cudaDeviceSynchronize() ); + /*std::cout << "Made it to loop" << std::endl; + #ifdef THREADED +#pragma omp parallel for +#endif + for (IT i = 0; i < Bdcsc -> nzc; ++i) { + //std::cout << "Getting: " << i << std::endl; + for (IT j = 0; j < colptr_size[i]; ++j) { + IT in = colptrC[i] + j; + //std::cout << "Grabbed: " << j << " with " << in << std::endl; + tuplesC[in] = std::make_tuple(tuplesC_o[in], tuplesC_t[in], tuplesC_th[in]); + //printf("Made tuple at in %i, with values %i, %i, and %i", in, tuplesC_o[in], tuplesC_t[in], tuplesC_th[in]); + //std::cout << "Built!" << std::endl; + //std::cout << "Done" <* spTuplesC = new SpTuples (nnzc, mdim, ndim, tuplesC, false, true); + + //std::cout << "Made it to return" << std::endl; + // std::cout << "localspgemminfo," << flop << "," << nnzc << "," << compression_ratio << "," << t1-t0 << std::endl; + // std::cout << hashSelected << ", " << Bdcsc->nzc << ", " << (float)hashSelected / Bdcsc->nzc << std::endl; + return spTuplesC; +} +#endif // Hybrid approach of multithreaded HeapSpGEMM and HashSpGEMM template SpTuples * LocalSpGEMMHash diff --git a/include/GALATIC/GALATICMinimumIncludes.cuh b/include/GALATIC/GALATICMinimumIncludes.cuh new file mode 100644 index 00000000..1194cd73 --- /dev/null +++ b/include/GALATIC/GALATICMinimumIncludes.cuh @@ -0,0 +1,4 @@ +#pragma once +#include "../../../ext/GALATIC/include/dCSR.cuh" +#include "../../../ext/GALATIC/include/SemiRingInterface.h" +#include "../../../ext/GALATIC/source/device/Multiply.cuh" diff --git a/include/GALATIC/LICENSE b/include/GALATIC/LICENSE new file mode 100644 index 00000000..16907429 --- /dev/null +++ b/include/GALATIC/LICENSE @@ -0,0 +1,33 @@ +*** License Agreement *** + +MIT License + +GALATIC: GPU Accelerated Sparse Matrix Multiplication over Arbitrary +Semirings (GALATIC) Copyright (c) 2020-2021, The Regents of the +University of California, through Lawrence Berkeley National Laboratory +(subject to receipt of any required approvals from the U.S. Dept. of Energy), +Richard Lettich, and GPUPeople. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + +SOFTWARE. diff --git a/include/GALATIC/README.md b/include/GALATIC/README.md new file mode 100644 index 00000000..5b79b58a --- /dev/null +++ b/include/GALATIC/README.md @@ -0,0 +1,217 @@ +# GALATIC + +Sparse Matrix-Sparse Matrix Multiplication CUDA Template library over generalized semirings. + +This repository was forked from [AC-SpGEMM](https://github.com/GPUPeople/ACSpGEMM). + +This was developed/Tested with +* Linux 4.12 +* CUDA compilation tools 11.1 +* A V100 + +--- + +## Quickstart Guide + +### **Orientation** + +The headers you likely need for minimal functionality (exclusion of `CSR.cuh` is possible, if you load directly to/from GPU memory). + +```c++ +#include "GALATIC/include/CSR.cuh" +#include "GALATIC/include/dCSR.cuh" +#include "GALATIC/include/SemiRingInterface.h" +#include "GALATIC/source/device/Multiply.cuh" +``` + +Where `CSR.cuh` is used to represent matrix storage in the [Compressed Sparse Row format](https://en.wikipedia.org/wiki/Sparse_matrix) for matrices in CPU memory. `dCSR.cuh` is the same, but represents data that is stored in GPU/device memory. + +(Note: there exists a `convert` function in `dCSR.cuh` for converting between the two. The GPU version is required to perform matrix multiplication) + +We recommend you look over these files two files, as you will need to construct the input matrices yourself. + +Additionally there is a `COO.cuh` for use with the coordinate list format which can be converted to `CSR` (but not `coo` to `dCSR`). The conversion is not particularly optimized. + +### **Defining Semirings** + +To define your semiring, you statically extend the "abstract" class defined in `SemiRingInterface.h` +```C++ +// SemiRingInterface.h +template +struct SemiRing { + typedef T leftInput_t; + typedef U rightInput_t; + typedef V output_t; // Don't worry about these typedefs for now + + V multiply(const T& a, const U& b); + V add(const V& a, const V& b); + + V AdditiveIdentity(); +}; +``` + +Notice that multiplication has a left input type `T`, a right input type `U`, and an output type `V`. Addition has `V` as both an input and an output. + +An example follows where multiplication and addition are defined canonically using doubles. + +The `__device__` annotation is required. The `__host__` annotation is needed in if you would like to verify against a CPU SpGEMM implementaiton. + +``` c++ +// Define Your Semiring +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) { return a * b; } + __host__ __device__ double add(const double& a, const double& b) { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + +``` +You may use the "Semiring" structure (e.g. `Arith_SR`) to hold data from outside the matrix (i.e. global device memory) by storing say, a pointer. This will affect performance. + +As to be expected, only memory which is accesible from the GPU is valid. In addition, you should be careful as to not mutate anything such that data races could occur or that an order of operations becomes required. + +Use of constructors / destructors is not reccomended for your semiring struct. The destructor for this will be ran multiple times before multiplication is complete. 
Ideally, the semiring struct should be [trivially copyable](https://en.cppreference.com/w/cpp/named_req/TriviallyCopyable). Thus you must manually free resources your semiring uses (if any) after you are done. Additionally, `T`/`U`/`V` (input / output types) should also be trivially copyable. + + +### Performing Matrix Multiplication + +To decrease the chance of bad error messages, we recommend using `SEMIRING_TYPE::leftInput_t`, `SEMIRING_TYPE::rightInput_t` and `SEMIRING_TYPE::output_t` for your matrices instead of the literal types of `T` and `U`. This will ensure any type errors occur in your code, rather than in the heavily templated library code. It will additionally help prevent errors claiming that the multiplication function for your parameters cannot be found. + +```C++ +CSR<Arith_SR::leftInput_t> input_A_CPU; +CSR<Arith_SR::rightInput_t> input_B_CPU; + +CSR<Arith_SR::output_t> result_mat_CPU; + +dCSR<Arith_SR::leftInput_t> input_A_GPU; +dCSR<Arith_SR::rightInput_t> input_B_GPU; + +dCSR<Arith_SR::output_t> result_mat_GPU; + + +/* ... + ... load data into input_A_CPU, input_B_CPU + ...*/ + +// Transfer input matrices onto GPU +// convert: out <- in +convert(input_A_GPU, input_A_CPU); +convert(input_B_GPU, input_B_CPU); + +// load data into semiring struct. For this one, we don't need to do anything, +// but you still need to pass it in for generality. The cost is trivial. +Arith_SR semiring; + + +// Setup execution options, we'll skip the details for now. + +const int Threads = 256; +const int BlocksPerMP = 1; +const int NNZPerThread = 2; +const int InputElementsPerThreads = 2; +const int RetainElementsPerThreads = 1; +const int MaxChunksToMerge = 16; +const int MaxChunksGeneralizedMerge = 256; // MAX: 865 +const int MergePathOptions = 8; + + +GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); + +const bool Debug_Mode = true; +DefaultTraits.preferLoadBalancing = true; +ExecutionStats stats; +stats.measure_all = false; + +// Actually perform the matrix multiplication +ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + + +// load results onto CPU. +convert(result_mat_CPU, result_mat_GPU); + +``` + +A minimal working example is located in `minimal_example.cu` (note: it currently contains different code). + +Compile it with + +`$ nvcc minimal_example.cu --ftemplate-backtrace-limit 1 --expt-relaxed-constexpr` + +Note: `--expt-relaxed-constexpr` is required. + + +---- + + + +### Testing +You can check the output against a simple CPU version (matrix values, row offsets, column ids). + +Simply add the header +```cpp +#include "GALATIC/include/TestSpGEMM.cuh" +``` + +and execute + +```cpp +TestSpGEMM(input_A_GPU, input_B_GPU, semiring, [=] (const Arith_SR::output_t &a, const Arith_SR::output_t &b) { return std::abs(a-b) < 0.01; }, DefaultTraits); +``` + +`DefaultTraits` is the configuration traits object, as +above. + +The lambda is a function which takes two values of your output type and returns true if they are equivalent, otherwise false. + +Make sure your semiring functions are marked with `__host__`. Additionally, if you are accessing data structures outside the matrix, `cudaMallocManaged` is recommended, as then both the CPU and GPU can access the memory using the same code. + +--- +## Important Information + + +AC-SpGEMM is highly configurable, as can be seen from the traits in `performTestCase`; these traits are implemented as template parameters. +Hence, for all combinations used, the **respective instantiation must be present**.
+Instantiations can be created by modifying the call to `Multiply` in `source/GPU/Multiply.cu` in line 781, which is given as +```cpp +bool called = + EnumOption<256, 256, 128, // Threads + EnumOption<3, 4, 1, // BlocksPerMP + EnumOption<2, 2, 1, // NNZPerThread + EnumOption<4, 4, 1, // InputElementsPerThreads + EnumOption<4, 4, 1, // RetainElementsPerThreads + EnumOption<16, 16, 8, // MaxChunksToMerge + EnumOption<256, 512, 256, // MaxChunksGeneralizedMerge + EnumOption<8, 8, 8, // MergePathOptions + EnumOption<0, 1, 1>>>>>>>>> // DebugMode + ::call(Selection>(call), scheduling_traits.Threads, scheduling_traits.BlocksPerMp, scheduling_traits.NNZPerThread, scheduling_traits.InputElementsPerThreads, scheduling_traits.RetainElementsPerThreads, scheduling_traits.MaxChunksToMerge, scheduling_traits.MaxChunksGeneralizedMerge, scheduling_traits.MergePathOptions, (int)Debug_Mode); +``` +This expanding template will instantiate variants of `MultiplyCall` with the parameters specified in `EnumOption`, so each EnumOption describes all the possible values for a certain property and all different configurations will be instantiated (e.g. BlocksPerMP with `EnumOption<3, 4, 1,` will instantiate the template call with BlocksPerMP=3 and BlocksPerMP=4) + +These parameters may require adjusting for optimal performance, or to just run if your semiring is especially large. + +--- + +# About + +GALATIC: GPU Accelerated Sparse Matrix Multiplication over Arbitrary +Semirings (GALATIC) Copyright (c) 2020-2021, The Regents of the +University of California, through Lawrence Berkeley National Laboratory +(subject to receipt of any required approvals from the U.S. Dept. of Energy), +Richard Lettich, and GPUPeople. All rights reserved. + +If you have questions about your rights to use or distribute this software, +please contact Berkeley Lab's Intellectual Property Office at +IPO@lbl.gov. + +NOTICE. This Software was developed under funding from the U.S. Department +of Energy and the U.S. Government consequently retains certain rights. As +such, the U.S. Government has been granted for itself and others acting on +its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the +Software to reproduce, distribute copies to the public, prepare derivative +works, and perform publicly and display publicly, and to permit others to do so. 
+ +# FAQ +richardl@berkeley.edu + diff --git a/include/GALATIC/a.out b/include/GALATIC/a.out new file mode 100755 index 00000000..4dd7fd7d Binary files /dev/null and b/include/GALATIC/a.out differ diff --git a/include/GALATIC/gmon.out b/include/GALATIC/gmon.out new file mode 100644 index 00000000..267ab310 Binary files /dev/null and b/include/GALATIC/gmon.out differ diff --git a/include/GALATIC/include/COO.cuh b/include/GALATIC/include/COO.cuh new file mode 100644 index 00000000..7d7bfd06 --- /dev/null +++ b/include/GALATIC/include/COO.cuh @@ -0,0 +1,262 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include "Vector.h" +#include +#include + +#include "Vector.h" + +#include +#include +#include +#include +#include +#include +template +struct COO +{ + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_ids; + std::unique_ptr col_ids; + + COO() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); +}; + + +namespace { + struct DataTypeValidator { + static bool validate(std::string type) { + return false; + } + }; +/* + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("real") == 0 || type.compare("integer") == 0; + } + }; + template + struct DataTypeValidator { + static const bool validate(std::string type) { + std::cout << "type: " << type << std::endl; + return type.compare("real") == 0 || type.compare("integer") == 0;; + } + }; + + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("integer") == 0; + } + }; + + template<> + struct DataTypeValidator { + static const bool validate(std::string type) { + return type.compare("integer") == 0; + } + };*/ +} + +template +void COO::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + row_ids = std::make_unique(n); + col_ids = std::make_unique(n); +} + +template +COO loadMTX(const char * file) +{ + std::ifstream fstream(file); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + COO resmatrix; + size_t num_rows = 0, num_columns = 0, num_non_zeroes = 0; + + size_t line_counter = 0; + std::string line; + bool pattern = false; + bool hermitian = false; + // read header; + std::getline(fstream, line); + if (line.compare(0, 32, "%%MatrixMarket matrix coordinate") != 0) + throw std::runtime_error("Can only read MatrixMarket format that is in coordinate form"); + std::istringstream iss(line); + std::vector tokens{ std::istream_iterator{iss}, std::istream_iterator{} }; + bool complex = false; + + if (tokens[3] == "pattern") + pattern = true; + else if (tokens[3] == "complex") + complex = true; + else if (tokens[3] != "real") + throw std::runtime_error("MatrixMarket data type does not match matrix format"); + bool symmetric = false; +// if (tokens[4].compare("general") == 0) + symmetric = false; + // else if (tokens[4].compare("symmetric") == 0) +// symmetric = true; + // else if (tokens[4].compare("Hermitian") == 0) + // hermitian = true; + // else + // throw std::runtime_error("Can only read MatrixMarket format that is either symmetric, general or hermitian"); + + while (std::getline(fstream, line)) + { + ++line_counter; + if (line[0] == '%') + continue; + std::istringstream liness(line); + liness >> num_rows >> num_columns >> num_non_zeroes; + if (liness.fail()) + throw std::runtime_error(std::string("Failed to read matrix market header from \"") + file + "\""); + //std::cout << "Read matrix header" << std::endl; + //std::cout << "rows: " << rows << " columns: " << columns << " nnz: " << nnz << std::endl; + break; + } + + size_t reserve = num_non_zeroes; + if (symmetric || hermitian) + reserve *= 2; + + resmatrix.alloc(num_rows, num_columns, reserve); + + //read data + size_t read = 0; + while (std::getline(fstream, line)) + { + ++line_counter; + if (line[0] == '%') + continue; + + std::istringstream liness(line); + + + do + { + char ch; + liness.get(ch); + if (!isspace(ch)) + { + liness.putback(ch); + break; + } + + } while (!liness.eof()); + if 
(liness.eof() || line.length() == 0) + continue; + + uint32_t r, c; + T d; + liness >> r >> c; + if (pattern) + d = 0;// T::Init(1); + else { + double a; + liness >> a; + d =0;// T::Init(a); + } + if (liness.fail()) + throw std::runtime_error(std::string("Failed to read data at line ") + std::to_string(line_counter) + " from matrix market file \"" + file + "\""); + if (r > num_rows) + throw std::runtime_error(std::string("Row index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); + if (c > num_columns) + throw std::runtime_error(std::string("Column index out of bounds at line ") + std::to_string(line_counter) + " in matrix market file \"" + file + "\""); + + resmatrix.row_ids[read] = r - 1; + resmatrix.col_ids[read] = c - 1; + resmatrix.data[read] = d; + ++read; + if ((symmetric || hermitian) && r != c) + { + resmatrix.row_ids[read] = c - 1; + resmatrix.col_ids[read] = r - 1; + resmatrix.data[read] = d; + ++read; + } + } + + resmatrix.nnz = read; + return resmatrix; +} + + + +template +COO loadCOO(const char * file) +{ + return COO(); +} + +template +void storeCOO(const COO& mat, const char * file) +{ + +} + +template +void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + std::fill(&res.data[0], &res.data[0] + outsize, 0); + + + if(transpose) + for (size_t i = 0; i < m.nnz; ++i) + res.data[m.col_ids[i]] += m.data[i] * v.data[m.row_ids[i]]; + else + for (size_t i = 0; i < m.nnz; ++i) + res.data[m.row_ids[i]] += m.data[i] * v.data[m.col_ids[i]]; +} + diff --git a/include/GALATIC/include/COO.h b/include/GALATIC/include/COO.h new file mode 100644 index 00000000..1dd8bee5 --- /dev/null +++ b/include/GALATIC/include/COO.h @@ -0,0 +1,62 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include "Vector.h" + +#include + +namespace GALATIC { +template +struct COO +{ + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_ids; + std::unique_ptr col_ids; + + COO() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); +}; + +template +COO loadMTX(const char* file); +template +COO loadCOO(const char* file); +template +void storeCOO(const COO& mat, const char* file); + +template +void spmv(DenseVector& res, const COO& m, const DenseVector& v, bool transpose = false); + +} diff --git a/include/GALATIC/include/CPU_SpGEMM.h b/include/GALATIC/include/CPU_SpGEMM.h new file mode 100644 index 00000000..98e54e4c --- /dev/null +++ b/include/GALATIC/include/CPU_SpGEMM.h @@ -0,0 +1,104 @@ +#include + +#include +#include "CSR.cuh" +#include "dCSR.cuh" + +#pragma once + + template + using Vec = std::vector; + + template + struct CSR_Tuple { + uint64_t col; + T value; + CSR_Tuple(uint64_t col, T value) : col(col), value(value) {} + }; + + + template + void Mult_CPU( CSR &A, CSR &B, CSR& C, SEMIRING_t& sr) + { + + + Vec> result = Vec>(); + Vec row_starts = Vec(); + + int last_percent = 0; + + Vec> temp_buffer = Vec>(); + + + for (uint64_t A_row_idx = 0; A_row_idx < A.rows; A_row_idx++) + { + if (A_row_idx*10 / A.rows > last_percent) { + std::cout << "CPU Done%: " << A_row_idx*100 / A.rows <= A.rows ? A.nnz : A.row_offsets[A_row_idx+1]; + + temp_buffer.clear(); + // for every element A_r,k in row A_row_idx + for (uint64_t A_element_idx = A_row_start; A_element_idx < A_row_end; A_element_idx++) + { + const LEFT_T &A_element = A.data[A_element_idx]; + + // for every element B_k,c + + + uint64_t A_col_idx = A.col_ids[A_element_idx]; + + uint64_t B_row_start = B.row_offsets[A_col_idx]; + uint64_t B_row_end = A_col_idx + 1 >= B.rows ? 
B.nnz : B.row_offsets[A_col_idx+1]; + + + for (uint64_t c_star = B_row_start; c_star < B_row_end; c_star++){ + const RIGHT_T & B_element = B.data[c_star]; + uint64_t b_col = B.col_ids[c_star]; + auto jq =sr.multiply(A_element, B_element); + temp_buffer.push_back(CSR_Tuple(b_col, jq )); + } + + + } + + std::sort( + temp_buffer.begin(), + temp_buffer.end(), + [] (const CSR_Tuple &a, const CSR_Tuple &b) { return a.col < b.col; } + ); + + + int64_t last_col = -1; + row_starts.push_back(result.size()); + for (auto & ele : temp_buffer) { + if (ele.col != last_col) { + result.push_back(ele); + } else { + result[result.size() -1] = CSR_Tuple(ele.col, sr.add(result[result.size() -1].value, ele.value)); + } + last_col = ele.col; + } + } + + C.alloc(A.rows,B.cols, result.size()); + + for (int i = 0; i < result.size(); i++) { + C.data[i] = result.at(i).value; + C.col_ids[i] = result.at(i).col; + } + + row_starts.push_back(result.size()); + + C.row_offsets[0] =0; + for (int i = 0; i < A.rows+1; i++) { + C.row_offsets[i] = row_starts.at(i); + } + + } \ No newline at end of file diff --git a/include/GALATIC/include/CSR.cuh b/include/GALATIC/include/CSR.cuh new file mode 100644 index 00000000..41d0072b --- /dev/null +++ b/include/GALATIC/include/CSR.cuh @@ -0,0 +1,338 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include +#include +#include +#include + +#include "COO.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma once + +template +struct COO; + +template +struct DenseVector; + +template +struct CSR +{ + struct Statistics + { + double mean; + double std_dev; + size_t max; + size_t min; + }; + + void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) + { + // running variance by Welford + size_t count = 0; + mean = 0; + double M2 = 0; + max = 0; + min = cols; + for (size_t i = 0; i < rows; ++i) + { + size_t r_length = row_offsets[i + 1] - row_offsets[i]; + min = std::min(min, r_length); + max = std::max(max, r_length); + ++count; + double newValue = static_cast(r_length); + double delta = newValue - mean; + mean = mean + delta / count; + double delta2 = newValue - mean; + M2 = M2 + delta * delta2; + } + if (count < 2) + std_dev = 0; + else + std_dev = sqrt(M2 / (count - 1)); + } + + Statistics rowStatistics() + { + Statistics stats; + computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min); + return stats; + } + + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_offsets; + std::unique_ptr col_ids; + + CSR() : rows(0), cols(0), nnz(0), data(std::unique_ptr(new T[0])) { + } + void alloc(size_t rows, size_t cols, size_t nnz); + + // CSR& operator=(CSR other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::move(other.data); + // this->row_offsets = std::move(other.row_offsets); + // this->col_ids = std::move(other.col_ids); + // return *this; + // } + + // CSR(const CSR& other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::make_unique(other.nnz); + // memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz); + // this->col_ids = std::make_unique(other.nnz); + // memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz); + // this->row_offsets = std::make_unique(other.rows + 1); + // memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1)); + // } + +}; + + + +namespace { + template + struct State + { + typedef VALUE_TYPE ValueType; + + bool transpose; + + State() : transpose(false) { } + State(bool transpose) : transpose(transpose) { } + }; + + struct CSRIOHeader + { + static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' }; + + char magic[sizeof(Magic)]; + uint64_t typesize; + uint64_t compresseddir; + uint64_t indexsize; + uint64_t fixedoffset; + uint64_t offsetsize; + uint64_t num_rows, num_columns; + uint64_t num_non_zeroes; + + CSRIOHeader() = default; + + + template + static uint64_t typeSize() + { + return sizeof(T); + } + + template + CSRIOHeader(const CSR& mat) + { + for (size_t i = 0; i < sizeof(Magic); ++i) + magic[i] = Magic[i]; + typesize = typeSize(); + compresseddir = 0; + indexsize = typeSize(); + fixedoffset = 0; + offsetsize = typeSize(); + + num_rows = mat.rows; + num_columns = mat.cols; + num_non_zeroes = mat.nnz; + } + + bool checkMagic() const + { + for (size_t i = 0; i < sizeof(Magic); ++i) + if (magic[i] != Magic[i]) + return false; + return true; + } + }; + constexpr char CSRIOHeader::Magic[]; +} + +template +void CSR::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + col_ids = std::make_unique(n); + row_offsets = std::make_unique(r+1); +} + +template 
+CSR loadCSR(const char * file) +{ + std::ifstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header; + State state; + fstream.read(reinterpret_cast(&header), sizeof(CSRIOHeader)); + if (!fstream.good()) + throw std::runtime_error("Could not read CSR header"); + if (!header.checkMagic()) + throw std::runtime_error("File does not appear to be a CSR Matrix"); + + fstream.read(reinterpret_cast(&state), sizeof(state)); + if (!fstream.good()) + throw std::runtime_error("Could not read CompressedMatrix state"); + if (header.typesize != CSRIOHeader::typeSize()) + throw std::runtime_error("File does not contain a CSR matrix with matching type"); + + CSR res; + res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes); + + fstream.read(reinterpret_cast(&res.data[0]), res.nnz * sizeof(T)); + fstream.read(reinterpret_cast(&res.col_ids[0]), res.nnz * sizeof(unsigned int)); + fstream.read(reinterpret_cast(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int)); + + if (!fstream.good()) + throw std::runtime_error("Could not read CSR matrix data"); + + return res; +} + +template +void storeCSR(const CSR& mat, const char * file) +{ + std::ofstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header(mat); + State state; + fstream.write(reinterpret_cast(&header), sizeof(CSRIOHeader)); + fstream.write(reinterpret_cast(&state), sizeof(state)); + fstream.write(reinterpret_cast(&mat.data[0]), mat.nnz * sizeof(T)); + fstream.write(reinterpret_cast(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int)); + fstream.write(reinterpret_cast(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int)); + +} + +template +void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? 
m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + if (transpose) + { + std::fill(&res.data[0], &res.data[0] + m.cols, 0); + for (size_t i = 0; i < m.rows; ++i) + { + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + res.data[m.col_ids[o]] += m.data[o] * v.data[i]; + } + } + else + { + for (size_t i = 0; i < m.rows; ++i) + { + T val = 0; + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + val += m.data[o] * v.data[m.col_ids[o]]; + res.data[i] = val; + } + } +} + +template +void convert(CSR& res, const COO& coo) +{ + struct Entry + { + unsigned int r, c; + T v; + bool operator < (const Entry& other) + { + if (r != other.r) + return r < other.r; + return c < other.c; + } + }; + + std::vector entries; + std::cout << coo.nnz << std::endl; + entries.reserve(coo.nnz); + for (size_t i = 0; i < coo.nnz; ++i) + entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] }); + std::sort(std::begin(entries), std::end(entries)); + + res.alloc(coo.rows, coo.cols, coo.nnz); + std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0); + for (size_t i = 0; i < coo.nnz; ++i) + { + res.data[i] = entries[i].v; + res.col_ids[i] = entries[i].c; + ++res.row_offsets[entries[i].r]; + } + + unsigned int off = 0; + for (size_t i = 0; i < coo.rows; ++i) + { + unsigned int n = off + res.row_offsets[i]; + res.row_offsets[i] = off; + off = n; + } + res.row_offsets[coo.rows] = off; +} diff --git a/include/GALATIC/include/CSR.h b/include/GALATIC/include/CSR.h new file mode 100644 index 00000000..dd0444fd --- /dev/null +++ b/include/GALATIC/include/CSR.h @@ -0,0 +1,340 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include +#include +#include +#include +#include "COO.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace GALATIC { +template +struct COO; + +template +struct DenseVector; + +template +struct CSR +{ + struct Statistics + { + double mean; + double std_dev; + size_t max; + size_t min; + }; + + void computeStatistics(double& mean, double& std_dev, size_t& max, size_t& min) + { + // running variance by Welford + size_t count = 0; + mean = 0; + double M2 = 0; + max = 0; + min = cols; + for (size_t i = 0; i < rows; ++i) + { + size_t r_length = row_offsets[i + 1] - row_offsets[i]; + min = std::min(min, r_length); + max = std::max(max, r_length); + ++count; + double newValue = static_cast(r_length); + double delta = newValue - mean; + mean = mean + delta / count; + double delta2 = newValue - mean; + M2 = M2 + delta * delta2; + } + if (count < 2) + std_dev = 0; + else + std_dev = sqrt(M2 / (count - 1)); + } + + Statistics rowStatistics() + { + Statistics stats; + computeStatistics(stats.mean, stats.std_dev, stats.max, stats.min); + return stats; + } + + size_t rows, cols, nnz; + + std::unique_ptr data; + std::unique_ptr row_offsets; + std::unique_ptr col_ids; + + CSR() : rows(0), cols(0), nnz(0) { } + void alloc(size_t rows, size_t cols, size_t nnz); + + // CSR& operator=(CSR other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::move(other.data); + // this->row_offsets = std::move(other.row_offsets); + // this->col_ids = std::move(other.col_ids); + // return *this; + // } + + // CSR(const CSR& other) + // { + // this->rows = other.rows; + // this->cols = other.cols; + // this->nnz = other.nnz; + // this->data = std::make_unique(other.nnz); + // memcpy(this->data.get(), other.data.get(), sizeof(T) * other.nnz); + // this->col_ids = std::make_unique(other.nnz); + // memcpy(this->col_ids.get(), other.col_ids.get(), sizeof(unsigned int) * other.nnz); + // this->row_offsets = std::make_unique(other.rows + 1); + // memcpy(this->row_offsets.get(), other.row_offsets.get(), sizeof(unsigned int) * (other.rows + 1)); + // } + +}; + + + +namespace { + template + struct State + { + typedef VALUE_TYPE ValueType; + + ValueType scaling; + bool transpose; + + State() : scaling(1), transpose(false) { } + State(ValueType scaling, bool transpose) : scaling(scaling), transpose(transpose) { } + }; + + struct CSRIOHeader + { + static constexpr char Magic[] = { 'H','i', 1, 'C','o','m','p','s','d' }; + + char magic[sizeof(Magic)]; + uint64_t typesize; + uint64_t compresseddir; + uint64_t indexsize; + uint64_t fixedoffset; + uint64_t offsetsize; + uint64_t num_rows, num_columns; + uint64_t num_non_zeroes; + + CSRIOHeader() = default; + + + template + static uint64_t typeSize() + { + return sizeof(T); + } + + template + CSRIOHeader(const CSR& mat) + { + for (size_t i = 0; i < sizeof(Magic); ++i) + magic[i] = Magic[i]; + typesize = typeSize(); + compresseddir = 0; + indexsize = typeSize(); + fixedoffset = 0; + offsetsize = typeSize(); + + num_rows = mat.rows; + num_columns = mat.cols; + num_non_zeroes = mat.nnz; + } + + bool checkMagic() const + { + for (size_t i = 0; i < sizeof(Magic); ++i) + if (magic[i] != Magic[i]) + return false; + return true; + } + }; + constexpr char CSRIOHeader::Magic[]; +} + +template +void CSR::alloc(size_t r, size_t c, size_t n) +{ + rows = r; + cols = c; + nnz = n; + + data = std::make_unique(n); + col_ids = std::make_unique(n); + row_offsets = 
std::make_unique(r+1); +} + +template +CSR loadCSR(const char * file) +{ + std::ifstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header; + State state; + fstream.read(reinterpret_cast(&header), sizeof(CSRIOHeader)); + if (!fstream.good()) + throw std::runtime_error("Could not read CSR header"); + if (!header.checkMagic()) + throw std::runtime_error("File does not appear to be a CSR Matrix"); + + fstream.read(reinterpret_cast(&state), sizeof(state)); + if (!fstream.good()) + throw std::runtime_error("Could not read CompressedMatrix state"); + if (header.typesize != CSRIOHeader::typeSize()) + throw std::runtime_error("File does not contain a CSR matrix with matching type"); + + CSR res; + res.alloc(header.num_rows, header.num_columns, header.num_non_zeroes); + + fstream.read(reinterpret_cast(&res.data[0]), res.nnz * sizeof(T)); + fstream.read(reinterpret_cast(&res.col_ids[0]), res.nnz * sizeof(unsigned int)); + fstream.read(reinterpret_cast(&res.row_offsets[0]), (res.rows+1) * sizeof(unsigned int)); + + if (!fstream.good()) + throw std::runtime_error("Could not read CSR matrix data"); + + return res; +} + +template +void storeCSR(const CSR& mat, const char * file) +{ + std::ofstream fstream(file, std::fstream::binary); + if (!fstream.is_open()) + throw std::runtime_error(std::string("could not open \"") + file + "\""); + + CSRIOHeader header(mat); + State state; + fstream.write(reinterpret_cast(&header), sizeof(CSRIOHeader)); + fstream.write(reinterpret_cast(&state), sizeof(state)); + fstream.write(reinterpret_cast(&mat.data[0]), mat.nnz * sizeof(T)); + fstream.write(reinterpret_cast(&mat.col_ids[0]), mat.nnz * sizeof(unsigned int)); + fstream.write(reinterpret_cast(&mat.row_offsets[0]), (mat.rows + 1) * sizeof(unsigned int)); + +} + +template +void spmv(DenseVector& res, const CSR& m, const DenseVector& v, bool transpose) +{ + if (transpose && v.size != m.rows) + throw std::runtime_error("SPMV dimensions mismatch"); + if (!transpose && v.size != m.cols) + throw std::runtime_error("SPMV dimensions mismatch"); + + size_t outsize = transpose ? 
m.cols : m.rows; + if (res.size < outsize) + res.data = std::make_unique(outsize); + res.size = outsize; + + if (transpose) + { + std::fill(&res.data[0], &res.data[0] + m.cols, 0); + for (size_t i = 0; i < m.rows; ++i) + { + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + res.data[m.col_ids[o]] += m.data[o] * v.data[i]; + } + } + else + { + for (size_t i = 0; i < m.rows; ++i) + { + T val = 0; + for (unsigned int o = m.row_offsets[i]; o < m.row_offsets[i+1]; ++o) + val += m.data[o] * v.data[m.col_ids[o]]; + res.data[i] = val; + } + } +} + +template +void convert(CSR& res, const COO& coo) +{ + struct Entry + { + unsigned int r, c; + T v; + bool operator < (const Entry& other) + { + if (r != other.r) + return r < other.r; + return c < other.c; + } + }; + + std::vector entries; + std::cout << coo.nnz << std::endl; + entries.reserve(coo.nnz); + for (size_t i = 0; i < coo.nnz; ++i) + entries.push_back(Entry{ coo.row_ids[i], coo.col_ids[i], coo.data[i] }); + std::sort(std::begin(entries), std::end(entries)); + + res.alloc(coo.rows, coo.cols, coo.nnz); + std::fill(&res.row_offsets[0], &res.row_offsets[coo.rows], 0); + for (size_t i = 0; i < coo.nnz; ++i) + { + res.data[i] = entries[i].v; + res.col_ids[i] = entries[i].c; + ++res.row_offsets[entries[i].r]; + } + + unsigned int off = 0; + for (size_t i = 0; i < coo.rows; ++i) + { + unsigned int n = off + res.row_offsets[i]; + res.row_offsets[i] = off; + off = n; + } + res.row_offsets[coo.rows] = off; +} + + +}; diff --git a/include/GALATIC/include/ColorText.h b/include/GALATIC/include/ColorText.h new file mode 100644 index 00000000..8c939162 --- /dev/null +++ b/include/GALATIC/include/ColorText.h @@ -0,0 +1,25 @@ +#include +#pragma once +namespace Color { + enum Code { + FG_RED = 31, + FG_GREEN = 32, + FG_YELLOW = 93, + + FG_BLUE = 34, + FG_DEFAULT = 39, + BG_RED = 41, + BG_GREEN = 42, + BG_BLUE = 44, + BG_DEFAULT = 49 + }; + class Modifier { + Code code; + public: + Modifier(Code pCode) : code(pCode) {} + friend std::ostream& + operator<<(std::ostream& os, const Modifier& mod) { + return os << "\033[" << mod.code << "m"; + } + }; +} \ No newline at end of file diff --git a/include/GALATIC/include/Compare.cuh b/include/GALATIC/include/Compare.cuh new file mode 100644 index 00000000..a577e314 --- /dev/null +++ b/include/GALATIC/include/Compare.cuh @@ -0,0 +1,109 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Compare.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.cuh" +#include +#include "common.h" + + +namespace ACSpGEMM { + + template + __global__ void d_compare(int in_rows, int in_cols, const uint32_t *__restrict reference_offset, + const uint32_t *__restrict reference_indices, const DataType *__restrict reference_values, + const uint32_t *__restrict compare_offset, const uint32_t *__restrict compare_indices, + const DataType *__restrict compare_values, bool compare_data, double epsilon, + uint32_t *verification) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) { + if (!(reference_values[ref_offset + i] == compare_values[comp_offset + i])) { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + } + + template + bool Compare(const dCSR &reference_mat, const dCSR &compare_mat, bool compare_data) { + int blockSize(256); + int gridSize(divup(reference_mat.rows + 1, blockSize)); + double epsilon = 0.1; + uint32_t *verification, h_verification; + cudaMalloc(&verification, sizeof(uint32_t)); + cudaMemset(verification, 0, sizeof(uint32_t)); + + d_compare <<< gridSize, blockSize >>> (reference_mat.rows, reference_mat.cols, + reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data, + compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data, + compare_data, epsilon, verification); + + cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost); + return (h_verification == 0); + } +} \ No newline at end of file diff --git a/include/GALATIC/include/Compare.h b/include/GALATIC/include/Compare.h new file mode 100644 index 00000000..1fc2cce9 --- /dev/null +++ b/include/GALATIC/include/Compare.h @@ -0,0 +1,123 
@@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Compare.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.h" + +#include "common.h" + + +namespace ACSpGEMM { + + template + __global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values, + const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification) + { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) + { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) + { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) + { + if (reference_values[ref_offset + i] != compare_values[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, 
reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + + return; + } + template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); + template + bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data) + { + int blockSize(256); + int gridSize(divup(reference_mat.rows + 1, blockSize)); + double epsilon = 0.1; + uint32_t* verification, h_verification; + cudaMalloc(&verification, sizeof(uint32_t)); + cudaMemset(verification, 0, sizeof(uint32_t)); + + d_compare << > > (reference_mat.rows, reference_mat.cols, + reference_mat.row_offsets, reference_mat.col_ids, reference_mat.data, + compare_mat.row_offsets, compare_mat.col_ids, compare_mat.data, + compare_data, epsilon, verification); + + cudaMemcpy(&h_verification, verification, sizeof(uint32_t), cudaMemcpyDeviceToHost); + return (h_verification == 0); + } +//// +// template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); +// template bool Compare(const dCSR& reference_mat, const dCSR& compare_mat, bool compare_data); +// template bool Compare( dCSR const& reference_mat, dCSR const& compare_mat, bool compare_data); + // template bool Compare( dCSR const& reference_mat, dCSR const& compare_mat, bool compare_data); + +} +// +// diff --git a/include/GALATIC/include/CustomExceptions.h b/include/GALATIC/include/CustomExceptions.h new file mode 100644 index 00000000..53003f28 --- /dev/null +++ b/include/GALATIC/include/CustomExceptions.h @@ -0,0 +1,85 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* CustomExceptions.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include + +class SpGEMMException : public std::exception +{ +public: + virtual char const * what() const noexcept{ return "SpGEMM: Stage failed"; } +}; + +class MergeSimpleCaseException : public std::exception +{ +public: +virtual char const * what() const noexcept { return "MERGE: Simple Case failed"; } +}; + +class MergeMaxChunksCaseException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Max Chunks Case failed"; } +}; + +class MergeGeneralizedCaseException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Generalized Case failed"; } +}; + +class MergeLoopingException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "MERGE: Merge Stage took longer than 10 seconds"; } +}; + +class RestartOutOfMemoryException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "RESTART: SpGEMM out of memory"; } +}; + +class RestartOutOfChunkPointerException : public std::exception +{ +public: + virtual char const * what() const noexcept { return "RESTART: SpGEMM out of chunk pointers"; } +}; + diff --git a/include/GALATIC/include/GALATIC.cuh b/include/GALATIC/include/GALATIC.cuh new file mode 100644 index 00000000..0088231d --- /dev/null +++ b/include/GALATIC/include/GALATIC.cuh @@ -0,0 +1,49 @@ + +#pragma once +namespace GALATIC { +template +void convert(CSR& dst, const dCSR& src, unsigned int padding=0) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; + + + + +// template +// cusp::csr_matrix< IDX_t, VALUE_t, cusp::device_memory> to_cusp_csr( CSR& orig_mat) +// { +// cusp::csr_matrix result_cpu(orig_mat.rows, orig_mat.cols, orig_mat.nnz); + +// for (int i = 0; i < orig_mat.rows; i++) { +// result_cpu.row_offsets[i] = orig_mat.row_offsets[i]; +// } + +// for (int i = 0; i < orig_mat.nnz; i++) { +// result_cpu.column_indices[i] = orig_mat.col_ids[i]; +// result_cpu.values[i] = orig_mat.data[i]; +// } + +// cusp::csr_matrix result(result_cpu); +// return result; +// } + + +// template +// void CuspMultiplyWrapper(cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& A, +// cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& B, +// cusp::csr_matrix< IDX_t, typename SEMIRING_t::output_t, cusp::device_memory>& C, +// SEMIRING_t sr) { +// cusp::multiply( A,B,C, [] __device__ (typename SEMIRING_t::output_t a ) {return SEMIRING_t::AdditiveIdentity(); }, +// [sr] __device__ ( typename SEMIRING_t::input_t a, typename SEMIRING_t::input_t b) {return sr.multiply(a,b); }, +// [sr] __device__ ( typename SEMIRING_t::output_t a, typename SEMIRING_t::output_t b) { +// auto q= sr.add(a,b); +// return q; } ); + +// } \ No newline at end of file diff --git a/include/GALATIC/include/GALATIC.h b/include/GALATIC/include/GALATIC.h 
new file mode 100644 index 00000000..c37626d7 --- /dev/null +++ b/include/GALATIC/include/GALATIC.h @@ -0,0 +1,50 @@ + +#include +#include + + +namespace GALATIC { +template +void convert(CSR& dst, const dCSR& src, unsigned int padding=0) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; + +template +cusp::csr_matrix< IDX_t, VALUE_t, cusp::device_memory> to_cusp_csr( CSR& orig_mat) +{ + cusp::csr_matrix result_cpu(orig_mat.rows, orig_mat.cols, orig_mat.nnz); + + for (int i = 0; i < orig_mat.rows; i++) { + result_cpu.row_offsets[i] = orig_mat.row_offsets[i]; + } + + for (int i = 0; i < orig_mat.nnz; i++) { + result_cpu.column_indices[i] = orig_mat.col_ids[i]; + result_cpu.values[i] = orig_mat.data[i]; + } + + cusp::csr_matrix result(result_cpu); + return result; +} + + + + +template +void CuspMultiplyWrapper(cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& A, + cusp::csr_matrix< IDX_t, typename SEMIRING_t::input_t, cusp::device_memory>& B, + cusp::csr_matrix< IDX_t, typename SEMIRING_t::output_t, cusp::device_memory>& C, + SEMIRING_t sr) { + cusp::multiply(A,B,C, __device__ [] (auto a) { return a;}, + __device__ [sr] (thrust::device_reference& a const, thrust::device_reference& b const) {return sr.multiply(a,b); }, + __device__ [sr](typename SEMIRING_t::output_t& a const, typename SEMIRING_t::output_t & b const ) {return sr.add(a,b); } ); + + +} \ No newline at end of file diff --git a/include/GALATIC/include/MergeCaseOffsets.h b/include/GALATIC/include/MergeCaseOffsets.h new file mode 100644 index 00000000..d48364e0 --- /dev/null +++ b/include/GALATIC/include/MergeCaseOffsets.h @@ -0,0 +1,49 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include + +// To circumnavigate a problem with nvcc and std::tuple as described here: https://devtalk.nvidia.com/default/topic/1028112/cuda-setup-and-installation/nvcc-bug-related-to-gcc-6-lt-tuple-gt-header-/ + +struct MergeCaseOffsets{ + size_t shared_rows_simple; + size_t shared_rows_max_chunks; + size_t shared_rows_generalized; + size_t shared_rows_simple_rows; + + MergeCaseOffsets(): + shared_rows_simple(0), shared_rows_max_chunks(0), shared_rows_generalized(0), shared_rows_simple_rows(0){} + + MergeCaseOffsets(size_t simple, size_t max, size_t generalized, size_t simple_rows): + shared_rows_simple(simple), shared_rows_max_chunks(max), shared_rows_generalized(generalized), shared_rows_simple_rows(simple_rows){} +}; \ No newline at end of file diff --git a/include/GALATIC/include/Multiply.h b/include/GALATIC/include/Multiply.h new file mode 100644 index 00000000..9823360d --- /dev/null +++ b/include/GALATIC/include/Multiply.h @@ -0,0 +1,58 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* Multiply.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include "dCSR.h" +#include "execution_stats.h" +#include "default_scheduling_traits.h" + +static void HandleError( cudaError_t err, + const char *file, + int line ) { + if (err != cudaSuccess) { + printf( "%s in %s at line %d\n", cudaGetErrorString( err ), + file, line ); + std::cout << std::flush; + throw std::exception(); + } +} +#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) + + diff --git a/include/GALATIC/include/SemiRingInterface.h b/include/GALATIC/include/SemiRingInterface.h new file mode 100644 index 00000000..61e17ff4 --- /dev/null +++ b/include/GALATIC/include/SemiRingInterface.h @@ -0,0 +1,33 @@ +// +// Created by Richard Lettich on 4/13/21. +// + +#ifndef ACSPGEMM_FUNCTION_INTERFACE_H +#define ACSPGEMM_FUNCTION_INTERFACE_H + +template +struct SemiRing { + typedef T leftInput_t; + typedef U rightInput_t; + typedef V output_t; + + V multiply(const T& a, const U& b); + V add(const V& a, const V& b); + + V AdditiveIdentity(); +}; + + + +#endif //ACSPGEMM_FUNCTION_INTERFACE_H + + + + + + + + + + + diff --git a/include/GALATIC/include/TestSpGEMM.cuh b/include/GALATIC/include/TestSpGEMM.cuh new file mode 100644 index 00000000..d65c14c6 --- /dev/null +++ b/include/GALATIC/include/TestSpGEMM.cuh @@ -0,0 +1,112 @@ + +#include +#include "CPU_SpGEMM.h" +#include "CSR.cuh" +#include "dCSR.cuh" +#include "../source/device/Multiply.cuh" + +template +void TestSpGEMM( dCSR& A, dCSR& B, SEMIRING_T semiring, F equiv_rel, GPUMatrixMatrixMultiplyTraits& traits) +{ + + //bool checkBitStability{true}; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + output_stats.measure_all = false; + + dCSR result_mat; + + std::cout << "starting GPU matrix multiply" << std::endl; + + ACSpGEMM::Multiply(A, B, result_mat, traits, warmupstats, true, semiring); + cudaDeviceSynchronize(); + std::cout << "GPU matrix multiply Done" << std::endl; + + + + // Convert input matrices + + CSR A_cpu; + CSR B_cpu; + + convert(A_cpu, A); + + convert(B_cpu, B); + + cudaDeviceSynchronize(); + + //convert gpu result to cpu + CSR GPU_result_cpu; + cudaDeviceSynchronize(); + + convert(GPU_result_cpu, result_mat); + + cudaDeviceSynchronize(); + + + CSR CPU_result_cpu; + Mult_CPU(A_cpu, B_cpu, CPU_result_cpu, semiring); + + std::cout << "Checking = # Rows, Cols, NNZ...."; + assert(CPU_result_cpu.rows == GPU_result_cpu.rows); + std::cout << "Cpu "<< CPU_result_cpu.cols << "gpu " << GPU_result_cpu.cols; + assert(CPU_result_cpu.cols == GPU_result_cpu.cols); + assert(CPU_result_cpu.nnz == GPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + std::cout << "Checking Equivalency for non zeros..."; + + + int correct = 0; + for (int i = 0; i < CPU_result_cpu.nnz; i++) { + if (equiv_rel(CPU_result_cpu.data[i], GPU_result_cpu.data[i])) { + correct++; + } + } + + std::cout << "num correct " << correct << "/ " << CPU_result_cpu.nnz << std::endl; + assert(correct == CPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + + std::cout << "Checking Equivalency for Column Id's..."; + + + + + int correct_col_ids = 0; + for (int i = 0; i < CPU_result_cpu.nnz; i++) { + if (CPU_result_cpu.col_ids[i] == GPU_result_cpu.col_ids[i]) { + correct_col_ids++; + } + } + + assert(correct_col_ids == 
CPU_result_cpu.nnz); + + std::cout << " correct" << std::endl; + + std::cout << "Checking Equivalency for Row offsets's..."; + + int cor_row_ids = 0; + + + + + for (int i = 0; i < CPU_result_cpu.rows+1; i++) { + if (CPU_result_cpu.row_offsets[i] == GPU_result_cpu.row_offsets[i]) { + cor_row_ids++; + + } else { + std::cout << " issue at " << i<< " with " << CPU_result_cpu.row_offsets[i] << " vs "<< GPU_result_cpu.row_offsets[i]<< std::endl; + } + } + + std::cout << cor_row_ids << " correct out of " << CPU_result_cpu.rows+1 << std::endl; + + assert(cor_row_ids == CPU_result_cpu.rows+1); + std::cout << " correct" << std::endl; + std::cout << "correctness check complete" << std::endl; + +} \ No newline at end of file diff --git a/include/GALATIC/include/Transpose.h b/include/GALATIC/include/Transpose.h new file mode 100644 index 00000000..6b89a617 --- /dev/null +++ b/include/GALATIC/include/Transpose.h @@ -0,0 +1,157 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +/*!/------------------------------------------------------------------------------ +* Transpose.h +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "dCSR.cuh" + + +// Global includes +#include +#include +#include "device_launch_parameters.h" + +// Local includes +#include "common.h" + +__global__ void d_calulateTransposeDistribution(int in_rows, int in_cols, + const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, uint32_t* output_offset) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + atomicAdd(output_offset + input_indices[offset + i], 1); + } + + return; +} + +template +__global__ void d_findPosition(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, + const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + uint32_t row_index = input_indices[offset + i]; + uint32_t insert_position = atomicAdd(helper + row_index, 1); + uint32_t o_offset = output_offset[row_index]; + helper_position[o_offset + insert_position] = tid; + } + + return; +} + +template +__global__ void d_writeTranspose(int in_rows, int in_cols, const uint32_t* __restrict input_offset, const uint32_t* __restrict input_indices, + const DataType* __restrict input_values, uint32_t* output_offset, uint32_t* output_indices, DataType* output_values, uint32_t* helper, uint32_t* helper_position) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t offset = input_offset[tid]; + uint32_t number_entries = input_offset[tid + 1] - offset; + + for (uint32_t i = 0; i < number_entries; ++i) + { + uint32_t row_index = input_indices[offset + i]; + uint32_t actual_position(0); + uint32_t entries_output = helper[row_index]; + uint32_t o_offset = output_offset[row_index]; + for (uint32_t j = 0; j < entries_output; ++j) + { + if (helper_position[o_offset + j] < tid) + ++actual_position; + } + output_indices[o_offset + actual_position] = tid; + output_values[o_offset + actual_position] = input_values[offset + i]; + } + + return; +} + + + template + void Transpose(const dCSR& matIn, dCSR& matTransposeOut) + { + int blockSize(256); + int gridSize(divup(matIn.rows + 1, blockSize)); + + matTransposeOut.alloc(matIn.cols, matIn.rows, matIn.nnz); + + // Allocate and set helper resources, Memset output vector + uint32_t* d_helper_pointer, *d_helper_position; + cudaMalloc(&d_helper_pointer, sizeof(uint32_t) * (matTransposeOut.rows + 1)); + cudaMalloc(&d_helper_position, sizeof(uint32_t) * (matTransposeOut.nnz)); + cudaMemset(d_helper_pointer, 0, sizeof(uint32_t) * (matTransposeOut.rows + 1)); + cudaMemset(matTransposeOut.row_offsets, 0, (matTransposeOut.rows + 1) * sizeof(uint32_t)); + + // Calculate entry distribution + d_calulateTransposeDistribution<<>>(matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matTransposeOut.row_offsets); + + // 
Prefix sum for new offset vector + thrust::device_ptr th_offset_vector(matTransposeOut.row_offsets); + thrust::exclusive_scan(th_offset_vector, th_offset_vector + matTransposeOut.rows + 1, th_offset_vector); + + // Find position for insertion (keeping sort order) + d_findPosition <<>> (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position); + + // Write Transpose + d_writeTranspose <<>> (matIn.rows, matIn.cols, matIn.row_offsets, matIn.col_ids, matIn.data, matTransposeOut.row_offsets, matTransposeOut.col_ids, matTransposeOut.data, d_helper_pointer, d_helper_position); + + // Free helper resources + cudaFree(d_helper_pointer); + cudaFree(d_helper_position); + + return; + } diff --git a/include/GALATIC/include/Vector.h b/include/GALATIC/include/Vector.h new file mode 100644 index 00000000..5457359a --- /dev/null +++ b/include/GALATIC/include/Vector.h @@ -0,0 +1,48 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
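A short usage sketch for the Transpose helper above (illustrative only; the dCSR element type is an assumption, since the template arguments are elided in this diff view).

#include "dCSR.cuh"
#include "Transpose.h"

void transpose_example(const dCSR<float>& A_d)    // device matrix, assumed populated
{
    dCSR<float> At_d;
    Transpose(A_d, At_d);       // allocates At_d as cols x rows with the same nnz and fills it
    cudaDeviceSynchronize();    // the three kernels above are launched asynchronously
}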
+// + +#pragma once + +#include + +template +struct DenseVector +{ + size_t size; + std::unique_ptr data; + + DenseVector() : size(0) { } + void alloc(size_t s) + { + data = std::make_unique(s); + size = s; + } +}; diff --git a/include/GALATIC/include/common.cuh b/include/GALATIC/include/common.cuh new file mode 100644 index 00000000..948be63d --- /dev/null +++ b/include/GALATIC/include/common.cuh @@ -0,0 +1,319 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + + +#include "meta_utils.h" +#include "common.h" +#include +#include + + + +/////////////// HELPERS ///////////////////////// + +const uint32_t WARP_SIZE = 32; + + + + +template +struct ConditionalIteration +{ + template + __device__ + static void iterate(F f) + { + bool res = f(BEGIN); + if (res) + ConditionalIteration::iterate(f); + } +}; + +template +struct ConditionalIteration +{ + template + __device__ + static void iterate(F f) + { + } +}; + + +template +struct VecLoadTypeImpl; + +template<> +struct VecLoadTypeImpl<4> +{ + using type = unsigned int; +}; +template<> +struct VecLoadTypeImpl<8> +{ + using type = uint2; +}; +template<> +struct VecLoadTypeImpl<16> +{ + using type = uint4; +}; + +template +struct VecLoadType +{ + using type = typename VecLoadTypeImpl::type; + union + { + T data[N]; + type vec; + }; + + __device__ __forceinline__ VecLoadType() = default; + __device__ __forceinline__ VecLoadType(type v) : vec(v) {}; +}; + +template +__device__ __forceinline__ void warp_load_vectorized(T (&out)[N], const T* in) +{ + static_assert(static_popcnt::value == 1, "load_vectorized only works for pow 2 elements"); + + using LoadType = VecLoadType; + const typename LoadType::type* vec_in = reinterpret_cast(in + (threadIdx.x/WARP_SIZE)*WARP_SIZE*N) + laneid(); + + //TODO: get rid of UB by doing an explicit unroll and just use the vec type + #pragma unroll + for (int i = 0; i < N / VecSize; ++i) + { + LoadType loaded; + loaded.vec = vec_in[i*WARP_SIZE]; + #pragma unroll + for (int j = 0; j < VecSize; ++j) + out[i*VecSize + j] = loaded.data[j]; + } +} + +template +__device__ __forceinline__ void vectorized_to_blocked(T(&data)[N]) +{ + const int Vecs = N / VecSize; + + //rotate + #pragma unroll + for (int k = 0; k < Vecs - 1; ++k) + { + if (laneid() % Vecs > k) + { + T tmp[VecSize]; + #pragma unroll + for (int i = 0; i < VecSize; ++i) + tmp[i] = data[(Vecs - 1)*VecSize + i]; + + #pragma unroll + for (int j = Vecs - 1; j > 0; --j) + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[j*VecSize + i] = data[(j - 1)*VecSize + i]; + + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[i] = tmp[i]; + } + } + + //shfl + int pad_offset = Vecs - (laneid() * Vecs) / WARP_SIZE; + int section_offset = (laneid() * Vecs) % WARP_SIZE; + + #pragma unroll + for (int j = 0; j < Vecs; ++j) + { + int shfl_offset = section_offset + ((pad_offset + j) % Vecs); + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[j*VecSize + i] = __shfl(data[j*VecSize + i], shfl_offset); + } + + //rotate back + #pragma unroll + for (int k = 0; k < Vecs - 1; ++k) + { + if ((laneid() * Vecs) / WARP_SIZE > k) + { + T tmp[VecSize]; + #pragma unroll + for (int i = 0; i < VecSize; ++i) + tmp[i] = data[i]; + + #pragma unroll + for (int j = 1; j < Vecs; ++j) + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[(j - 1)*VecSize + i] = data[j*VecSize + i]; + + #pragma unroll + for (int i = 0; i < VecSize; ++i) + data[(Vecs - 1)*VecSize + i] = tmp[i]; + } + } +} + + +template +struct ThreadOddEvenMerge; + +template +struct ThreadOddEvenMergeImpl; + +template +__device__ __forceinline__ void swap(T& a, T& b) +{ + T temp = a; + a = b; + b = temp; +} + +template +struct ThreadOddEvenMergeImpl +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + ThreadOddEvenMerge::run(key); + ThreadOddEvenMerge::run(key); +#pragma unroll + for (int i = LO + R; i + R < LO + N; i += M) + if (COMP::comp(key[i], key[i + R])) + swap(key[i], key[i + R]); + } + template + __device__ 
__forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + ThreadOddEvenMerge::run(key, value); + ThreadOddEvenMerge::run(key, value); +#pragma unroll + for (int i = LO + R; i + R < LO + N; i += M) + if (COMP::comp(key[i], key[i + R])) + swap(key[i], key[i + R]), + swap(value[i], value[i + R]); + } +}; +template +struct ThreadOddEvenMergeImpl +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + if (COMP::comp(key[LO], key[LO + R])) + swap(key[LO], key[LO + R]); + } + template + __device__ __forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + if (COMP::comp(key[LO], key[LO + R])) + swap(key[LO], key[LO + R]), + swap(value[LO], value[LO + R]); + } +}; + + +template +struct ThreadOddEvenMerge : public ThreadOddEvenMergeImpl +{ +}; + +template +struct ThreadOddEvenMergeSort +{ + template + __device__ __forceinline__ static void run(K(&key)[L]) + { + ThreadOddEvenMergeSort::run(key); + ThreadOddEvenMergeSort::run(key); + ThreadOddEvenMerge::run(key); + } + template + __device__ __forceinline__ static void run(K(&key)[L], V(&value)[L]) + { + ThreadOddEvenMergeSort::run(key, value); + ThreadOddEvenMergeSort::run(key, value); + ThreadOddEvenMerge::run(key, value); + } +}; + +template +struct ThreadOddEvenMergeSort +{ + template + __device__ __forceinline__ static void run(K (&key)[L]) + { } + template + __device__ __forceinline__ static void run(K (&key)[L], V(&value)[L]) + { } +}; + +template +__device__ __forceinline__ void threadOddEvenMergeSort(K(&key)[L]) +{ + ThreadOddEvenMergeSort::run(key); +} +template +__device__ __forceinline__ void threadOddEvenMergeSort(K(&key)[L], V(&value)[L]) +{ + ThreadOddEvenMergeSort::run(key, value); +} + +struct SortAscending +{ + template + __device__ __forceinline__ static bool comp(T a, T b) + { + return a > b; + } +}; + +struct SortDescending +{ + template + __device__ __forceinline__ static bool comp(T a, T b) + { + return a < b; + } +}; + +__device__ __forceinline__ inline uint32_t laneid() +{ + uint32_t mylaneid; + asm("mov.u32 %0, %laneid;" : "=r" (mylaneid)); + return mylaneid; +} diff --git a/include/GALATIC/include/common.h b/include/GALATIC/include/common.h new file mode 100644 index 00000000..f2e43944 --- /dev/null +++ b/include/GALATIC/include/common.h @@ -0,0 +1,46 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
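A device-side sketch of the register sorting network defined above; it assumes the comparator is the first template parameter (as the SortAscending / SortDescending helpers suggest) and that the per-thread array length is a power of two, which the odd-even merge requires.

#include "common.cuh"

__global__ void sort_in_registers_demo(int* out)
{
    int keys[4] = { 7, 1, 5, 3 };                  // per-thread data held in registers
    threadOddEvenMergeSort<SortAscending>(keys);   // SortAscending::comp(a,b) == (a > b), so this sorts ascending
    if (threadIdx.x == 0 && blockIdx.x == 0)
        for (int i = 0; i < 4; ++i)
            out[i] = keys[i];                      // out == {1, 3, 5, 7}
}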
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include + +template +constexpr __host__ __device__ __forceinline__ T divup(T a, T b) +{ + return (a + b - 1) / b; +} + +template +constexpr __host__ __device__ __forceinline__ T alignment(const T size, size_t alignment) +{ + return divup(size, alignment) * alignment; +} diff --git a/include/GALATIC/include/consistent_memory.h b/include/GALATIC/include/consistent_memory.h new file mode 100644 index 00000000..7e2e0c1e --- /dev/null +++ b/include/GALATIC/include/consistent_memory.h @@ -0,0 +1,109 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
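Two worked examples for the helpers above; since both are constexpr, the results can be checked at compile time.

#include "common.h"

static_assert(divup(10, 4) == 3,      "integer division rounded up");
static_assert(alignment(20, 8) == 24, "20 rounded up to the next multiple of 8");
static_assert(alignment(16, 8) == 16, "exact multiples are left unchanged");

// Typical use when sizing a kernel launch:  int gridSize = divup(num_rows, blockSize);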
+// + +#pragma once + +#include +#include +#include "memory_space.h" + +namespace ACSpGEMM { + class RegisteredMemory + { + public: + virtual size_t clear() = 0; + }; + + inline std::vector& getRegMemories() + { + static std::vector m; + return m; + } + + inline void register_consistent_memory(RegisteredMemory* memory) + { + getRegMemories().push_back(memory); + } + inline void unregister_consistent_memory(RegisteredMemory* memory) + { + auto &m = getRegMemories(); + std::remove(begin(m), end(m), memory); + } + inline size_t clear_consistentMemory() + { + size_t s = 0; + for (auto m : getRegMemories()) + s += m->clear(); + return s; + } + + template + class ConsistentMemory; + + template + class RegisteredMemoryVar : RegisteredMemory + { + T v; + size_t clear() override + { + v = 0; + return 0; + } + public: + RegisteredMemoryVar() : v(0) + { + register_consistent_memory(this); + } + explicit RegisteredMemoryVar(T v) : v(v) + { + register_consistent_memory(this); + } + ~RegisteredMemoryVar() + { + unregister_consistent_memory(this); + } + + RegisteredMemoryVar& operator+= (T add) + { + v += add; + return *this; + } + + void operator = (T other) + { + v = other; + } + operator T() const noexcept + { + return v; + } + }; +} diff --git a/include/GALATIC/include/dCSR.cuh b/include/GALATIC/include/dCSR.cuh new file mode 100644 index 00000000..a67cea6e --- /dev/null +++ b/include/GALATIC/include/dCSR.cuh @@ -0,0 +1,163 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
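A short sketch of how the registry above is used: a RegisteredMemoryVar registers itself on construction, and clear_consistentMemory() resets every registered variable (the variable names are illustrative).

#include "consistent_memory.h"

void registry_example()
{
    ACSpGEMM::RegisteredMemoryVar<size_t> temp_bytes(128);   // registers itself in the global list
    temp_bytes += 64;                                        // now holds 192
    ACSpGEMM::clear_consistentMemory();                      // resets every registered variable to 0
    size_t after = temp_bytes;                               // implicit conversion; after == 0
    (void)after;
}                                                            // destructor unregisters the variable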
+// + +#pragma once +#include "CSR.cuh" +#include +#include +#include +#pragma once + + +template +struct CSR; + +template +struct dCSR +{ + size_t rows, cols, nnz; + + T* data; + unsigned int* row_offsets; + unsigned int* col_ids; + + dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } + void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true); + void reset(); + ~dCSR(); +}; + + + +namespace +{ + template + void dealloc(dCSR& mat) + { + cudaPointerAttributes attr; + //cudaPointerGetAttributes(&attr, mat.col_ids); + //if (attr.type == 2) { + if (mat.col_ids != nullptr) + cudaFree(mat.col_ids); + //} + //cudaPointerGetAttributes(&attr, mat.data); + //if (attr.type == 2) + if (mat.data != nullptr) + cudaFree(mat.data); + //cudaPointerGetAttributes(&attr, mat.row_offsets); + //if (attr.type == 2) + if (mat.row_offsets != nullptr) + cudaFree(mat.row_offsets); + + mat.nnz = 0; + mat.col_ids = nullptr; + mat.data = nullptr; + mat.row_offsets = nullptr; + //if(cudaSuccess != cudaGetLastError()) std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl; + + } +} + +template +void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets) +{ + dealloc(*this); + rows = r; + cols = c; + nnz = n; + cudaMalloc(&data, sizeof(T)*n); + cudaMalloc(&col_ids, sizeof(unsigned int)*n); + if (allocOffsets) + cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1)); +} +template +dCSR::~dCSR() +{ + dealloc(*this); +} + +template +void dCSR::reset() +{ + dealloc(*this); +} + + +template +void convert(dCSR& dst, const CSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice); + cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + if (padding) + { + cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T)); + cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int)); + cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int)); + } +} + +template +void convert(CSR& dst, const dCSR& src) +{ + unsigned int padding= 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +template +void convert(dCSR& dst, const dCSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice); +} + +template +void convert(CSR& dst, const CSR& src) +{ + unsigned int padding=0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + memcpy(dst.data.get(), 
src.data.get(), dst.nnz * sizeof(T)); + memcpy(dst.col_ids.get(), src.col_ids.get(), dst.nnz * sizeof(unsigned int)); + memcpy(dst.row_offsets.get(), src.row_offsets.get(), (dst.rows + 1) * sizeof(unsigned int)); +} diff --git a/include/GALATIC/include/dCSR.h b/include/GALATIC/include/dCSR.h new file mode 100644 index 00000000..1e9506d3 --- /dev/null +++ b/include/GALATIC/include/dCSR.h @@ -0,0 +1,158 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
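A host/device round-trip sketch for the conversion helpers above (illustrative; CSR<float> is the host-side type from CSR.cuh, which the .get() calls above assume stores its arrays in smart-pointer members).

#include "CSR.cuh"
#include "dCSR.cuh"

void round_trip(const CSR<float>& A_h)   // host matrix, assumed filled elsewhere
{
    dCSR<float> A_d;
    convert(A_d, A_h);                   // host -> device (cudaMemcpyHostToDevice)

    CSR<float> A_back;
    convert(A_back, A_d);                // device -> host (cudaMemcpyDeviceToHost)
}                                        // A_d's destructor frees the device buffers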
+// + +#pragma once +#include "CSR.h" +#include +#include +#include + +namespace GALATIC { +template +struct dCSR +{ + size_t rows, cols, nnz; + + T* data; + unsigned int* row_offsets; + unsigned int* col_ids; + + dCSR() : rows(0), cols(0), nnz(0), data(nullptr), row_offsets(nullptr), col_ids(nullptr) { } + void alloc(size_t rows, size_t cols, size_t nnz, bool allocOffsets = true); + void reset(); + ~dCSR(); +}; + + + +namespace +{ + template + void dealloc(dCSR& mat) + { + if (mat.col_ids != nullptr) + cudaFree(mat.col_ids); + if (mat.data != nullptr) + cudaFree(mat.data); + if (mat.row_offsets != nullptr) + cudaFree(mat.row_offsets); + mat.col_ids = nullptr; + mat.data = nullptr; + mat.row_offsets = nullptr; + } +} + +template +void dCSR::alloc(size_t r, size_t c, size_t n, bool allocOffsets) +{ + dealloc(*this); + rows = r; + cols = c; + nnz = n; + cudaMalloc(&data, sizeof(T)*n); + cudaMalloc(&col_ids, sizeof(unsigned int)*n); + if (allocOffsets) + cudaMalloc(&row_offsets, sizeof(unsigned int)*(r+1)); +} +template +dCSR::~dCSR() +{ + dealloc(*this); +} + +template +void dCSR::reset() +{ + dealloc(*this); +} + + +template +void convert(dCSR& dst, const CSR& src, unsigned int padding) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8*padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, &src.data[0], src.nnz * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(dst.col_ids, &src.col_ids[0], src.nnz * sizeof(unsigned int), cudaMemcpyHostToDevice); + cudaMemcpy(dst.row_offsets, &src.row_offsets[0], (src.rows + 1) * sizeof(unsigned int), cudaMemcpyHostToDevice); + + if (padding) + { + cudaMemset(dst.data + src.nnz, 0, 8 * padding * sizeof(T)); + cudaMemset(dst.col_ids + src.nnz, 0, 8 * padding * sizeof(unsigned int)); + cudaMemset(dst.row_offsets + src.rows + 1, 0, padding * sizeof(unsigned int)); + } +} + +template +void convert(CSR& dst, const dCSR& src, unsigned int padding) +{ + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +template +void convert(dCSR& dst, const dCSR& src) +{ + unsigned int padding = 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + cudaMemcpy(dst.data, src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.col_ids, src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToDevice); + cudaMemcpy(dst.row_offsets, src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToDevice); +} + +template +void convert(CSR& dst, const CSR& src) +{ + unsigned int padding = 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; dst.cols = src.cols; + memcpy(dst.data.get(), src.data.get(), dst.nnz * sizeof(T)); + memcpy(dst.col_ids.get(), src.col_ids.get(), dst.nnz * sizeof(unsigned int)); + memcpy(dst.row_offsets.get(), src.row_offsets.get(), (dst.rows + 1) * sizeof(unsigned int)); +} + +template +void convert(CSR& dst, const dCSR& src) +{ + unsigned int padding= 0; + dst.alloc(src.rows + padding, src.cols, src.nnz + 8 * padding); + dst.rows = src.rows; dst.nnz = src.nnz; 
dst.cols = src.cols; + cudaMemcpy(dst.data.get(), src.data, dst.nnz * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.col_ids.get(), src.col_ids, dst.nnz * sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaMemcpy(dst.row_offsets.get(), src.row_offsets, (dst.rows + 1) * sizeof(unsigned int), cudaMemcpyDeviceToHost); +} + +}; diff --git a/include/GALATIC/include/dVector.h b/include/GALATIC/include/dVector.h new file mode 100644 index 00000000..d4b8be56 --- /dev/null +++ b/include/GALATIC/include/dVector.h @@ -0,0 +1,74 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#pragma once + +#include "Vector.h" +#include + +template +struct dDenseVector +{ + size_t size; + T* data; + + dDenseVector() : size(0), data(nullptr) { } + void alloc(size_t s) + { + if (data != nullptr) + cudaFree(data); + cudaMalloc(&data, sizeof(T)*s); + size = s; + } + ~dDenseVector() + { + if (data != nullptr) + cudaFree(data); + } +}; + +template +void convert(dDenseVector & dvec, const DenseVector& vec, unsigned int padding = 0) +{ + dvec.alloc(vec.size+padding); + dvec.size = vec.size; + + cudaMemcpy(dvec.data, &vec.data[0], dvec.size * sizeof(T), cudaMemcpyHostToDevice); + if (padding) + cudaMemset(dvec.data + dvec.size, 0, padding * sizeof(T)); +} + +template +void convert(DenseVector & vec, const dDenseVector& dvec) +{ + vec.alloc(dvec.size); + cudaMemcpy(&vec.data[0], dvec.data, dvec.size * sizeof(T), cudaMemcpyDeviceToHost); +} \ No newline at end of file diff --git a/include/GALATIC/include/default_scheduling_traits.h b/include/GALATIC/include/default_scheduling_traits.h new file mode 100644 index 00000000..47ea9946 --- /dev/null +++ b/include/GALATIC/include/default_scheduling_traits.h @@ -0,0 +1,80 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
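The dense-vector helpers above follow the same host/device pattern; a minimal sketch (names illustrative, element type assumed).

#include "Vector.h"
#include "dVector.h"

void vector_round_trip()
{
    DenseVector<float> x_h;
    x_h.alloc(1024);
    for (size_t i = 0; i < x_h.size; ++i)
        x_h.data[i] = 1.0f;

    dDenseVector<float> x_d;
    convert(x_d, x_h);    // host -> device; an optional third argument adds zero padding
    convert(x_h, x_d);    // device -> host
}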
+// + +#pragma once + + +struct GeneralSchedulingTraits +{ + static const bool MultiGPU = false; + + bool preferLoadBalancing; + size_t cpu_threads; + int device; + + GeneralSchedulingTraits() : cpu_threads(8), device(0), preferLoadBalancing(true) { } +}; + +struct AVX2SchedulingTratis : public GeneralSchedulingTraits{}; + +struct DefaultSchedulingTraits : public GeneralSchedulingTraits {}; + +struct GPUMatrixMatrixMultiplyTraits : public GeneralSchedulingTraits +{ + const int Threads; + const int BlocksPerMp; + const int NNZPerThread; + const int InputElementsPerThreads; + const int RetainElementsPerThreads; + const int MaxChunksToMerge; + const int MaxChunksGeneralizedMerge; + const int MergePathOptions; + + + GPUMatrixMatrixMultiplyTraits( + const int Threads = 256, + const int BlocksPerMp = 3, + const int NNZPerThread = 2, + const int InputElementsPerThreads = 4, + const int RetainElementsPerThreads = 4, + const int MaxChunksToMerge = 16, + const int MaxChunksGeneralizedMerge = 256, + const int MergePathOptions = 8) : + Threads(Threads), + BlocksPerMp(BlocksPerMp), + NNZPerThread(NNZPerThread), + InputElementsPerThreads(InputElementsPerThreads), + RetainElementsPerThreads(RetainElementsPerThreads), + MaxChunksToMerge(MaxChunksToMerge), + MaxChunksGeneralizedMerge(MaxChunksGeneralizedMerge), + MergePathOptions(MergePathOptions) + {} +}; diff --git a/include/GALATIC/include/device/ARowStorage.cuh b/include/GALATIC/include/device/ARowStorage.cuh new file mode 100644 index 00000000..d3c6c4f0 --- /dev/null +++ b/include/GALATIC/include/device/ARowStorage.cuh @@ -0,0 +1,173 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
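A construction sketch for the traits type above: every tuning parameter has the default listed in the constructor, so callers either take the defaults or override the leading parameters positionally (the values below are illustrative only).

#include "default_scheduling_traits.h"

void traits_example()
{
    GPUMatrixMatrixMultiplyTraits defaults;        // Threads=256, BlocksPerMp=3, NNZPerThread=2, ...
    GPUMatrixMatrixMultiplyTraits wide(512, 2);    // 512 threads per block, 2 blocks per SM, rest default
    wide.preferLoadBalancing = true;               // fields inherited from GeneralSchedulingTraits
    wide.device = 0;                               // CUDA device the multiplication should run on
    (void)defaults;
}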
+// + +/*!/------------------------------------------------------------------------------ + * ARowStorage.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "../meta_utils.h" + + +template +class ARowStorage; + +template +class ARowStorage +{ + INDEX_TYPE row_ids[NNZ_PER_BLOCK]; + +public: + using EncodedRowType = INDEX_TYPE; + + __device__ __forceinline__ + void clear() + { + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_BLOCK; i += THREADS) + row_ids[i + threadIdx.x] = 0; + } + + __device__ __forceinline__ + void storeReference(uint32_t id, INDEX_TYPE row) + { + } + + __device__ __forceinline__ + void storeRow(uint32_t id, uint32_t ref, INDEX_TYPE row) + { + row_ids[id] = row; + //printf("direct %d stores row: %d %d %d -> %d gets row %d\n", threadIdx.x, id, ref, row, id, row); + } + __device__ __forceinline__ + void storeEncodedRow(uint32_t id, INDEX_TYPE row) + { + row_ids[id] = row; + } + + __device__ __forceinline__ + INDEX_TYPE getEncodedRow(uint32_t id) + { + //printf("direct %d req encoded row: %d (which is -> %d)\n", threadIdx.x, id, row_ids[id]); + return row_ids[id]; + } + + __device__ __forceinline__ + INDEX_TYPE decodeRow(INDEX_TYPE row) + { + //printf("direct %d decodes row: %d -> %d\n", threadIdx.x, row, row); + return row; + } + + __device__ __forceinline__ + static INDEX_TYPE restartRowDecode(uint32_t restart_row, INDEX_TYPE first_row) + { + return first_row + restart_row; + } + __device__ __forceinline__ + static uint32_t restartRowEncode(INDEX_TYPE row, INDEX_TYPE first_row) + { + return row - first_row; + } +}; + +template +class ARowStorage +{ + using ReferenceType = ChooseBitDataType::value>::value>; + INDEX_TYPE row_ids[NNZ_PER_BLOCK]; + ReferenceType references[NNZ_PER_BLOCK]; + + +public: + + using EncodedRowType = uint32_t; + + __device__ __forceinline__ + void clear() + { + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_BLOCK; i += THREADS) + references[i + threadIdx.x] = 0; + } + + + __device__ __forceinline__ + void storeReference(EncodedRowType id, INDEX_TYPE row) + { + row_ids[id] = row; + //printf("%d stores ref: %d %d -> %d gets real row %d\n", threadIdx.x, id, row, id, row); + } + + __device__ __forceinline__ + void storeRow(uint32_t id, EncodedRowType ref, INDEX_TYPE row) + { + references[id] = static_cast(ref); + //printf("%d stores row: %d %d %d -> %d gets ref %d\n", threadIdx.x, id, ref, row, id, ref); + } + + __device__ __forceinline__ + void storeEncodedRow(uint32_t id, EncodedRowType ref) + { + references[id] = static_cast(ref); + } + + __device__ __forceinline__ + EncodedRowType getEncodedRow(uint32_t id) + { + //printf("%d req encoded row: %d (which is %d -> %d)\n", threadIdx.x, id, references[id], row_ids[references[id]]); + return references[id]; + } + + __device__ __forceinline__ + INDEX_TYPE decodeRow(EncodedRowType row) + { + //printf("%d decodes row: %d -> %d\n", threadIdx.x, row, row_ids[row]); + return row_ids[row]; + } + + __device__ __forceinline__ + static INDEX_TYPE restartRowDecode(EncodedRowType restart_row, INDEX_TYPE first_row) + { + return restart_row; + } + __device__ __forceinline__ + static uint32_t restartRowEncode(EncodedRowType row, INDEX_TYPE first_row) + { + return row; + } +}; \ No newline at end of file diff --git a/include/GALATIC/include/device/Chunk.cuh b/include/GALATIC/include/device/Chunk.cuh new file mode 100644 index 00000000..e0064f31 --- /dev/null +++ 
b/include/GALATIC/include/device/Chunk.cuh @@ -0,0 +1,290 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * Chunk.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once +#include "../common.h" + +using ChunkSortType = uint32_t; +const int chunk_member_offset = alignment(sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(ChunkSortType), 8); + + +template +struct alignas(16) Chunk +{ + // with which row does the chunk start + INDEX_TYPE firstrow; + // the number of matrix entries and column offsets in the chunk + uint32_t num_entries; + // where does the last (uncompleted) row in the chunk start + uint32_t last_row_count; + // how many elements are in the first row + uint32_t first_row_count; + // sortkey + ChunkSortType sort_key; + + + + __device__ __forceinline__ Chunk(uint32_t num, INDEX_TYPE firstrow, uint32_t firstrowCount = 0, uint32_t lastrowCount = 0, ChunkSortType sortkey = 0) : + firstrow(firstrow), num_entries(num), last_row_count(lastrowCount), first_row_count(firstrowCount), sort_key(sortkey) + { + + } + + __device__ __forceinline__ static uint32_t size(uint32_t count, bool nextPointers) + { + uint32_t s = (nextPointers ? 
16 : 0) + count*(sizeof(VALUE_TYPE) + sizeof(INDEX_TYPE)) + sizeof(Chunk); + return (s + 15) & 0xFFFFFFF0; + + } + __device__ __forceinline__ + static Chunk* place(void* chunks, uint32_t offset, uint32_t num, INDEX_TYPE firstrow, uint32_t firstrowCount = 0, uint32_t lastrowCount = 0, ChunkSortType sortkey = 0) + { + return new(reinterpret_cast(chunks) + offset) Chunk(num, firstrow, firstrowCount, lastrowCount, sortkey); + } + __device__ __forceinline__ + static Chunk* cast(void* chunks, uint32_t offset) + { + return reinterpret_cast(reinterpret_cast(chunks) + offset); + } + //__device__ __forceinline__ void write(void* location) const + //{ + // *reinterpret_cast(location) = *reinterpret_cast(this); + //} + + __device__ __forceinline__ VALUE_TYPE* values_direct(uint32_t count) + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset); + } + __device__ __forceinline__ INDEX_TYPE* indices_direct(uint32_t count) + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset + sizeof(VALUE_TYPE)*count); + } + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) const + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset); + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) const + { + return reinterpret_cast(reinterpret_cast(this) + chunk_member_offset + sizeof(VALUE_TYPE)*count); + } + + __device__ __forceinline__ void writeNextFront(Chunk* next) + { + *reinterpret_cast(reinterpret_cast(this) - 16) = next; + } + + __device__ __forceinline__ void writeNextBack(Chunk* next) + { + *reinterpret_cast(reinterpret_cast(this) - 8) = next; + } + + __device__ __forceinline__ void writeNextPointer(Chunk* next, bool front) + { + *reinterpret_cast(reinterpret_cast(this) - 16 + (front ? 0 : 8)) = next; + } + + __device__ __forceinline__ Chunk* readNextFront() const + { + return *reinterpret_cast(reinterpret_cast(this) - 16); + } + + __device__ __forceinline__ Chunk* readNextBack() const + { + return *reinterpret_cast(reinterpret_cast(this) - 8); + } + + __device__ __forceinline__ void setLastConsumed() + { + last_row_count = last_row_count | 0x80000000; + } + __device__ __forceinline__ void setFirstConsumed() + { + first_row_count = first_row_count | 0x80000000; + } + + static const uint32_t StartingOffsetFlag = 0x40000000; + + __device__ __forceinline__ uint32_t startingoffset() const + { + if ((first_row_count & StartingOffsetFlag) == StartingOffsetFlag) + return first_row_count & 0x3FFFFFFF; + return 0; + } + + __device__ __forceinline__ bool lastConsumed() const + { + return (last_row_count & 0x80000000) != 0; + } + __device__ __forceinline__ bool firstConsumed() const + { + return (first_row_count & 0x80000000) != 0; + } + + __device__ __forceinline__ uint32_t lastCountCleared() const + { + return last_row_count & (~0x80000000); + } + __device__ __forceinline__ uint32_t firstCountCleared() const + { + return first_row_count & (~0xC0000000); + } + + __device__ __forceinline__ VALUE_TYPE getMultiplier() const + { + return 1; + } + __device__ __forceinline__ bool isDirect() const + { + return last_row_count == 0xFFFFFFFF; + } +}; + +template +__device__ __forceinline__ bool allocChunk(uint32_t count, uint32_t* chunk_alloc, uint32_t chunk_size, uint32_t& offset, int& worstcaseRem, bool nextPointers = true) +{ + uint32_t s = Chunk::size(count, nextPointers); + worstcaseRem -= s; + offset = atomicAdd(chunk_alloc, s) + (nextPointers ? 
16 : 0); + return offset + s <= chunk_size; +} + +template +__device__ __forceinline__ uint32_t completeChunkAlloc(uint32_t count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t chunk_size, void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, uint32_t* chunk_pointer_pos, OUT_OF_MEM_CALLBACK cb, OUT_OF_CHUNK_POINTER_CALLBACK ccb) +{ + //alloc chunk + uint32_t chunkoff; + int unused_worstCaseRemainder; + if (!allocChunk(count, chunk_alloc, chunk_size, chunkoff, unused_worstCaseRemainder)) + { + chunkoff = 0xFFFFFFFF; + cb(); + } + else + { + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = 0xFFFFFFFF; + if (chunk_pointer_position == chunk_pointer_sizes) + *chunk_pointer_pos = chunk_pointer_sizes; + ccb(); + } + else + { + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::cast(chunks, chunkoff)); + } + } + return chunkoff; +} + + + +template +struct alignas(16) DirectChunk : public Chunk +{ + using Chunk::sort_key; + const INDEX_TYPE* indices; + const VALUE_TYPE* values; + LEFT_T multiplier; + + __device__ __forceinline__ DirectChunk(uint32_t num, INDEX_TYPE firstrow, const INDEX_TYPE* indices, const VALUE_TYPE* values, LEFT_T multiplier, ChunkSortType sortkey = 0) : + Chunk(num, firstrow, num, 0xFFFFFFFF, sortkey), + indices(indices), + values(values), + multiplier(multiplier) + { + + } + + __device__ __forceinline__ static uint32_t size(bool nextPointers) + { + uint32_t s = (nextPointers ? 16 : 0) + sizeof(DirectChunk); + return (s + 15) & 0xFFFFFFF0; + } + + __device__ __forceinline__ + static DirectChunk* place(void* chunks, uint32_t offset, uint32_t num, INDEX_TYPE firstrow, const INDEX_TYPE* indices, const VALUE_TYPE* values, LEFT_T multiplier, ChunkSortType sortkey = 0) + { + return new(reinterpret_cast(chunks) + offset) DirectChunk(num, firstrow, indices, values, multiplier, sortkey); + } + __device__ __forceinline__ + static DirectChunk* cast(void* chunks, uint32_t offset) + { + return reinterpret_cast(reinterpret_cast(chunks) + offset); + } + //__device__ __forceinline__ void write(void* location) const + //{ + // *reinterpret_cast(location)[0] = *reinterpret_cast(this)[0]; + // *reinterpret_cast(location)[1] = *reinterpret_cast(this)[1]; + //} + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) + { + return values; + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) + { + return indices; + } + + __device__ __forceinline__ const VALUE_TYPE* values_direct(uint32_t count) const + { + return values; + } + __device__ __forceinline__ const INDEX_TYPE* indices_direct(uint32_t count) const + { + return indices; + } + + __device__ __forceinline__ LEFT_T getMultiplier() const + { + return multiplier; + } +}; + +template +__device__ __forceinline__ bool allocDirectChunk(uint32_t* chunk_alloc, uint32_t chunk_size, uint32_t& offset, bool nextPointers = true) +{ + uint32_t s = DirectChunk::size(nextPointers); + offset = atomicAdd(chunk_alloc, s) + (nextPointers ? 
16 : 0); + return offset + s <= chunk_size; +} diff --git a/include/GALATIC/include/device/HelperFunctions.cuh b/include/GALATIC/include/device/HelperFunctions.cuh new file mode 100644 index 00000000..6df121ee --- /dev/null +++ b/include/GALATIC/include/device/HelperFunctions.cuh @@ -0,0 +1,877 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
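Both Chunk::size and DirectChunk::size above round their byte counts up to a 16-byte boundary via (s + 15) & 0xFFFFFFF0; two worked values for reference:

//   s = 25  ->  (25 + 15) & ~0xF  =  40 & ~0xF  =  32
//   s = 48  ->  (48 + 15) & ~0xF  =  63 & ~0xF  =  48   (already a multiple of 16)
static_assert(((25u + 15u) & 0xFFFFFFF0u) == 32u, "rounded up to the next multiple of 16");
static_assert(((48u + 15u) & 0xFFFFFFF0u) == 48u, "multiples of 16 are unchanged");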
+// + +/*!/------------------------------------------------------------------------------ + * HelperFunctions.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include +#include + +#include + +#include +#include + +#include +#include "../meta_utils.h" +#include "../devicetools/event.h" +#include "../MergeCaseOffsets.h" + +namespace +{ + template < + typename IndexType, + typename ConversionOp, + typename OffsetT = ptrdiff_t> + class CustomGeneratorIterator + { + public: + + // Required iterator traits + typedef CustomGeneratorIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef typename ConversionOp::value_type value_type; ///< The type of the element the iterator can point to + //typedef value_type* pointer; ///< pointer not supported + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + private: + + ConversionOp conversion_op; + IndexType id; + + public: + + /// Constructor + __host__ __device__ __forceinline__ CustomGeneratorIterator( + ConversionOp conversion_op, ///< Conversion functor to wrap + IndexType base_id = 0) ///< Input id to start at + : + conversion_op(conversion_op), + id(base_id) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ++id; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ++id; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(id); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(conversion_op, id + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + id += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(conversion_op, id - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + id -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return id - other.id; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(id + n); + } + + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (id == rhs.id); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (id != rhs.id); + } + }; + + + template < + typename ValueType, + typename Consume, + typename OffsetT = 
ptrdiff_t> + class CustomOutputConsumerIterator + { + private: + + // Proxy object + struct Reference + { + ValueType* ptr; + Consume consume; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr, Consume consume) : ptr(ptr), consume(consume) {} + + /// Assignment + __device__ __forceinline__ ValueType operator = (ValueType val) + { + consume(ptr, val); + return val; + } + }; + + public: + + // Required iterator traits + typedef CustomOutputConsumerIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + private: + + Consume consume; + pointer ptr; + + + public: + + /// Constructor + __host__ __device__ __forceinline__ CustomOutputConsumerIterator( + Consume consume, + pointer ptr = nullptr) ///< Native pointer to wrap + : + consume(consume), + ptr(ptr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr, consume); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(consume, ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(consume, ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return Reference(ptr + n, consume); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + }; + + + template + class CaseCombinerConverter + { + const INDEX_TYPE* const maxPerRowElements; + const uint32_t* const sharedRows; + const uint32_t* const chunkCounter; + public: + + typedef OUT_TYPE value_type; + + __host__ __device__ __forceinline__ + CaseCombinerConverter(const uint32_t* sharedRows, const 
INDEX_TYPE* maxPerRowElements, const uint32_t* chunkCounter) : + maxPerRowElements(maxPerRowElements), + sharedRows(sharedRows), + chunkCounter(chunkCounter) + { } + + __host__ __device__ __forceinline__ + OUT_TYPE operator()(const uint32_t &id) const + { + uint32_t row = sharedRows[id]; + uint32_t chunks = chunkCounter[row]; + //INDEX_TYPE elementCounter = maxPerRowElements[row]; + int type = 2; + if (chunks == 2 && maxPerRowElements[row] < MergeMaxElements) + type = 0; + else if ((chunks & (~MAX_CHUNKS_CASE)) < MaxMergeChunks && (chunks & CASE_DISTINCTION) == 0 /*&& false*/) + type = 1; + + if (type < OFFSET || type >= OFFSET + NUM) + return 0; + OUT_TYPE res = ((OUT_TYPE(1) << (BITS-1)) | OUT_TYPE(1)) << (BITS*(type - OFFSET)); + /*if (blockIdx.x == 0) + printf("%d (%d) convert %d (%d) to %llx\n", id, sharedRows[id], elementCounter, type, (uint64_t)res);*/ + return res; + } + }; + + template + class CaseSeparatorConsumer + { + INDEX_TYPE* const outputPointers; + INDEX_TYPE* const counters; + INDEX_TYPE* const row_counts; + const uint32_t* const sharedRows; + const uint32_t activeRows; + public: + __host__ __device__ __forceinline__ + CaseSeparatorConsumer(const uint32_t* sharedRows, INDEX_TYPE* outputPointers, INDEX_TYPE* counters, uint32_t activeRows, INDEX_TYPE* row_counts) : + outputPointers(outputPointers), + counters(counters), + sharedRows(sharedRows), + row_counts{ row_counts }, + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + void operator()(IN_TYPE* virtualOffset, const IN_TYPE sumresult) const + { + int type = -1; + INDEX_TYPE offset = 0; + const IN_TYPE mask = IN_TYPE(1) << (BITS - 1); + const IN_TYPE select = mask - 1; + + IN_TYPE* virtualBase = nullptr; + uint32_t dist = virtualOffset - virtualBase; + if (dist == activeRows - 1) + { + // final writes counts + #pragma unroll + for (int i = 0; i < NUM; ++i) + { + if (i + OFFSET < 3) + { + INDEX_TYPE sum = static_cast((sumresult >> (i*BITS)) & select); + counters[OFFSET + i] = sum; + } + } + counters[3] = 0; + outputPointers[(activeRows) * 3] = 0; + } + + #pragma unroll + for (int i = 0; i < NUM; ++i) + { + if ((sumresult & (mask << (i*BITS))) != 0) + { + type = i; + offset = static_cast((sumresult >> (i*BITS)) & select); + } + } + if (type == -1) + { + //if(blockIdx.x == 0) + // printf("%d %d: %llx would not write\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult); + return; + } + + type += OFFSET; + //if (blockIdx.x == 0) + // printf("%d %d: %llx would write %d to %d(%d) (%llx, %llx, %d + %d)\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult, offset, dist, type, outputPointers, counters, activeRows, activeRows*type + dist + 1); + //printf("%d %d: %llx writinting %d (%d) to %d (%d*%d + %d -1)\n", blockIdx.x, threadIdx.x, (uint64_t)sumresult, sharedRows[dist], dist, + // (activeRows)*type + offset - 1, activeRows, type,offset); + /*if (type == 0 && row_counts[sharedRows[dist]] > 1024) + printf("RowCount at position %u is : %u\n", sharedRows[dist], row_counts[sharedRows[dist]]);*/ + outputPointers[(activeRows)*type + offset-1] = sharedRows[dist]; + } + }; + + template + struct CombinedAdd + { + /// Boolean max operator, returns (a > b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & MASK) + b; + } + }; + + + struct BlockOffsetRange + { + uint32_t begin, end; + uint32_t count; + }; + + template + class BlockOffsetCreator + { + const INDEX_TYPE* const maxPerRowElements; + const INDEX_TYPE* const sharedRows; + public: + + typedef BlockOffsetRange value_type; + + __host__ __device__ __forceinline__ + BlockOffsetCreator(const INDEX_TYPE* sharedRows, const INDEX_TYPE* maxPerRowElements) : + maxPerRowElements(maxPerRowElements), + sharedRows(sharedRows) + { } + + __host__ __device__ __forceinline__ + BlockOffsetRange operator()(const uint32_t &id) const + { + /*if(maxPerRowElements[sharedRows[id]] > 1024)*/ + //printf("%d creating range: (row %d) %d-%d with %d\n", id, sharedRows[id], id, id + 1, maxPerRowElements[sharedRows[id]]); + return BlockOffsetRange{ id, id + 1, maxPerRowElements[sharedRows[id]] }; + } + }; + + template + struct BlockOffsetCombiner + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + //we have to consider the maximum number elements the block can hold (Max Comb) + //and we have to consider the maximum number shared rows the block can handle (SimpleMergeThreads - 1) -> < SimpleMergeThreads + if (a.end == b.begin && a.count + b.count < MaxComb && b.end - a.begin < SimpleMergeThreads) + { + //printf("merging: %d<>%d %d<>%d %d + %d < %d\n", a.begin, a.end, b.begin, b.end, a.count, b.count, MaxComb); + return BlockOffsetRange{ a.begin, b.end, a.count + b.count}; + } + else + { + //printf("not merging: %d<>%d %d<>%d %d + %d < %d\n", a.begin, a.end, b.begin, b.end, a.count, b.count, MaxComb); + } + //if (b.count >= 1024) + // printf("B.count: %u is too large\n", b.count); + return b; + } + }; + + class BlockOffsetExtractor + { + uint2* const rangeOut; + public: + __host__ __device__ __forceinline__ + BlockOffsetExtractor(uint2* rangeOut) : + rangeOut(rangeOut) + { } + + __host__ __device__ __forceinline__ + void operator()(BlockOffsetRange* virtualOffset, const BlockOffsetRange result) const + { + BlockOffsetRange* virtualBase = nullptr; + uint32_t dist = virtualOffset - virtualBase; + /*if(result.count > 1024)*/ + //printf("%d writing range (%d<>%d) %d | %u\n", dist, result.begin, result.end, result.count, result.test); + rangeOut[dist] = uint2{ result.begin, result.end }; + } + }; + + + class RangeStartTranslator + { + const uint2* __restrict__ ranges; + const uint32_t activeRows; + public: + + typedef uint32_t value_type; + + __host__ __device__ __forceinline__ + RangeStartTranslator(const uint2* ranges, uint32_t activeRows) : + ranges(ranges), + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + uint32_t operator()(const uint32_t &id) const + { + uint32_t res = 0x80000001; + if (id < activeRows - 1) + { + if (ranges[id].x == ranges[id + 1].x) + res = 0; + } + + //if(res != 0) + // printf("%d is a block end (%d<>%d)\n", id, ranges[id].x, ranges[id].y); + + return res; + } + }; + + template + class BlockStartWriter + { + INDEX_TYPE* const blockOffsets; + INDEX_TYPE* counter; + const uint32_t activeRows; + public: + __host__ __device__ __forceinline__ + BlockStartWriter(INDEX_TYPE* blockOffsets, INDEX_TYPE* counter, uint32_t activeRows) : + blockOffsets(blockOffsets), + counter(counter), + activeRows(activeRows) + { } + + __host__ __device__ __forceinline__ + void operator()(uint32_t* virtualOffset, const uint32_t result) const + { + uint32_t* z = nullptr; + INDEX_TYPE d = 
virtualOffset - z; + if (d == activeRows - 1) + { + *counter = (result & (~0x80000000)); + } + if (result & 0x80000000) + { + blockOffsets[result & (~0x80000000)] = d+1; + //printf("writing block offset %d : %d\n", result & (~0x80000000), d+1); + } + } + }; + + struct PinnedHostMemDeleter + { + void operator()(void* p) const noexcept + { + cudaFreeHost(p); + } + }; + + template + inline auto allocHostMemory(size_t elements = 1) + { + void* p; + cudaMallocHost(&p, elements * sizeof(T)); + return std::unique_ptr(static_cast(p)); + } +} + +namespace std +{ + template + struct iterator_traits> + { + typedef typename CustomGeneratorIterator::value_type value_type; + }; +} + +template +size_t AcSpGEMMKernels::tempMemSize(size_t CRows) +{ + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + size_t temp_storage_bytes2 = 0; + INDEX_TYPE *in = nullptr, *out = nullptr; + uint64_t *in64 = nullptr, *out64 = nullptr; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out, CRows + 1); + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes2, in64, out64, CRows + 1); + temp_storage_bytes = std::max(temp_storage_bytes, temp_storage_bytes2); + + CustomGeneratorIterator> initr(BlockOffsetCreator(nullptr, nullptr)); + CustomOutputConsumerIterator outitr(BlockOffsetExtractor(nullptr)); + cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes2, initr, outitr, BlockOffsetCombiner<1024, 1024>(), CRows); + temp_storage_bytes = std::max(temp_storage_bytes, temp_storage_bytes2); + + CustomGeneratorIterator initr2(RangeStartTranslator(nullptr, 0)); + CustomOutputConsumerIterator> outitr2(BlockStartWriter(nullptr, nullptr, 0)); + cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes2, initr2, outitr2, CombinedAdd(), CRows); + + return std::max(temp_storage_bytes, temp_storage_bytes2) + (CRows * sizeof(uint2) + 15) / 16 * 16 + 16; +} + + template + MergeCaseOffsets AcSpGEMMKernels::assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream, CUstream overlapStream) + { + //const INDEX_TYPE MaxBlockLoad = DoubleMaxBlockLoad; + size_t outtempsize = (activeRows * sizeof(uint2) + 15) / 16 * 16; + size_t adjtempsize = tempMemSize - outtempsize; + void* temporaryMem = reinterpret_cast(reinterpret_cast(tempMem) + outtempsize); + + static auto blockCounter = allocHostMemory(4); + + //sum and offsets of merges + if (activeRows < ((1u<<9)-1)) + { + // use 8 bits + CaseCombinerConverter inConv(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr(inConv); + + CaseSeparatorConsumer outConv(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr(outConv); + + CombinedAdd comb; + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, activeRows, stream); + } + else if (activeRows < ((1u << 20) - 1)) + { + // use 16 bit + CaseCombinerConverter inConv(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr(inConv); + + CaseSeparatorConsumer outConv(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr(outConv); + + CombinedAdd comb; + 
cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, activeRows, stream); + } + else + { + // use 32 bit triple call + CombinedAdd comb; + + CaseCombinerConverter inConv0(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr0(inConv0); + + CaseSeparatorConsumer outConv0(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr0(outConv0); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr0, outitr0, comb, activeRows, stream); + + CaseCombinerConverter inConv1(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr1(inConv1); + + CaseSeparatorConsumer outConv1(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr1(outConv1); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr1, outitr1, comb, activeRows, stream); + + CaseCombinerConverter inConv2(sharedRows, reinterpret_cast(maxPerRowElements), chunckCounter); + CustomGeneratorIterator> initr2(inConv2); + + CaseSeparatorConsumer outConv2(sharedRows, reinterpret_cast(per_block_offsets), reinterpret_cast(num_merge_blocks), activeRows, reinterpret_cast(maxPerRowElements)); + CustomOutputConsumerIterator> outitr2(outConv2); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr2, outitr2, comb, activeRows, stream); + } + + cudaMemcpy(blockCounter.get(), reinterpret_cast(num_merge_blocks), 3 * sizeof(INDEX_TYPE), cudaMemcpyDeviceToHost); + uint32_t combSharedRows = blockCounter.get()[0]; + + { + BlockOffsetCreator rangeCreator(reinterpret_cast(per_block_offsets), reinterpret_cast(maxPerRowElements)); + CustomGeneratorIterator> initr(rangeCreator); + + BlockOffsetCombiner comb; + + BlockOffsetExtractor rangeExtractor(reinterpret_cast(tempMem)); + CustomOutputConsumerIterator outitr(rangeExtractor); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, combSharedRows, stream); + } + + { + RangeStartTranslator rangeCreator(reinterpret_cast(tempMem), combSharedRows); + CustomGeneratorIterator initr(rangeCreator); + + CombinedAdd comb; + + BlockStartWriter rangeExtractor(reinterpret_cast(per_block_offsets) + 3 * activeRows, reinterpret_cast(num_merge_blocks) + 3, combSharedRows); + CustomOutputConsumerIterator> outitr(rangeExtractor); + cub::DeviceScan::InclusiveScan(temporaryMem, adjtempsize, initr, outitr, comb, combSharedRows, stream); + } + + cudaMemcpy(blockCounter.get(), reinterpret_cast(num_merge_blocks), 4 * sizeof(INDEX_TYPE), cudaMemcpyDeviceToHost); + + return MergeCaseOffsets(blockCounter.get()[3], blockCounter.get()[1], blockCounter.get()[2], blockCounter.get()[0]); + } + + template + void AcSpGEMMKernels::computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream) + { + INDEX_TYPE* workmem = reinterpret_cast(inout); + cub::DeviceScan::ExclusiveSum(tempMem, tempMemSize, workmem, workmem, Crows + 1, stream); + } + +__forceinline__ __device__ unsigned laneid() +{ + unsigned ret; + asm volatile ("mov.u32 %0, %%laneid;" : "=r"(ret)); + return ret; +} + +template +__device__ __forceinline__ void updateMinValue(T &sv, T(&values)[N], int num = N) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T v = sv; + #pragma unroll + for (int i = 0; i < N; ++i) + if (i < num) + v = min(v, 
values[i]); + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Min()); + if (laneid() == 0) + atomicMin(&sv, res); +} + +template +__device__ __forceinline__ void updateMinValue(T &sv, T v) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Min()); + if (laneid() == 0) + atomicMin(&sv, res); +} + +template +__device__ __forceinline__ void updateMaxValue(T &sv, T v) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Max()); + if (laneid() == 0) + atomicMax(&sv, res); +} + +template +__device__ __forceinline__ void updateMaxValue(T &sv, T(&values)[N], int num = N) +{ + typename cub::WarpReduce< T >::TempStorage nosmem; + T v = sv; + #pragma unroll + for (int i = 0; i < N; ++i) + if (i < num) + v = max(v, values[i]); + + T res = cub::WarpReduce< T >(nosmem).Reduce(v, cub::Max()); + if (laneid() == 0) + atomicMax(&sv, res); +} + +template +struct count_clz +{ + static const uint32_t value = (X & 0x80000000) ? Completed : static_clz< (X << 1), Completed + 1 >::value; +}; +template +struct count_clz +{ + static const uint32_t value = 32; +}; + +template +struct ChooseBitDataTypeRounded; +template<> +struct ChooseBitDataTypeRounded<8> +{ + using type = uint8_t; +}; +template<> +struct ChooseBitDataTypeRounded<16> +{ + using type = uint16_t; +}; +template<> +struct ChooseBitDataTypeRounded<32> +{ + using type = uint32_t; +}; +template<> +struct ChooseBitDataTypeRounded<64> +{ + using type = uint64_t; +}; + +template +struct ChooseBitDataTypeRounding +{ + using type = typename ChooseBitDataTypeRounded::type; +}; + +template +using ChooseBitDataType = typename ChooseBitDataTypeRounding::type; + + +template +__device__ __forceinline__ +void addPotentiallySharedRow(uint32_t row, Chunk * chunk, bool first_row, + void** output_row_list_heads, uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, bool force_addlist = false) +{ + unsigned long long* rlh = reinterpret_cast(output_row_list_heads); + unsigned long long c = reinterpret_cast(chunk) | (first_row?2:0); + uint64_t next = atomicExch(rlh + row, c); + bool addlist = false; + if (next == 0) + { + if(force_addlist) + addlist = true; + else + { + //we are first, so mark that next needs to add to list + uint64_t set = atomicCAS(rlh + row, c, (c | 0x1)); + if (set != c) + //someone else added to the list before we could mark for setting shared list, so we have to do it + addlist = true; + } + } + else if ((next & 0x1) != 0) + { + addlist = true; + next = next & 0xFFFFFFFFFFFFFFFEULL; + } + + chunk->writeNextPointer(reinterpret_cast*>(next), first_row); + + uint32_t p = static_cast(-1); + if (addlist) + { + p = atomicAdd(shared_rows_alloc, 1); + shared_rows_tracker[p] = row; + } +} + +// ######################################################################## +// Explicit instantiations +// ######################################################################## +template size_t AcSpGEMMKernels::tempMemSize(size_t CRows); +template void AcSpGEMMKernels::computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream); +#define GPUCompressedMatrixMatrixMultiplyHelper(THREADS, TEMPPERTHREAD, MERGEMAXCHUNKS) \ +template MergeCaseOffsets AcSpGEMMKernels::assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream, CUstream 
overlapStream); + diff --git a/include/GALATIC/include/device/MultiplyKernels.h b/include/GALATIC/include/device/MultiplyKernels.h new file mode 100644 index 00000000..8bf27fb3 --- /dev/null +++ b/include/GALATIC/include/device/MultiplyKernels.h @@ -0,0 +1,221 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
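The iterator pair defined in HelperFunctions.cuh above (CustomGeneratorIterator and CustomOutputConsumerIterator) lets cub::DeviceScan run without materializing its input or output arrays: the generator synthesizes each input value from its index via a ConversionOp, and the consumer hands each scanned value, together with its virtual position, to a Consume functor. The sketch below shows that pattern in isolation; the functor names (SquareOfIndex, ScatterToArray), the wrapper function, and the include path are illustrative assumptions, not part of this patch.

```cpp
// Sketch only (compile as a .cu file): mirrors how tempMemSize/assignCombineBlocks
// drive cub::DeviceScan through the custom iterators of HelperFunctions.cuh.
#include <cub/cub.cuh>
#include "GALATIC/include/device/HelperFunctions.cuh"   // assumed include path

struct SquareOfIndex                 // ConversionOp: must expose value_type and operator()(id)
{
    using value_type = int;
    __host__ __device__ __forceinline__ int operator()(const int& id) const { return id * id; }
};

struct ScatterToArray                // Consume: receives (virtual position, scanned value)
{
    int* out;
    __device__ __forceinline__ void operator()(int* virtualPos, int value) const
    {
        int* base = nullptr;              // the iterator is constructed with ptr == nullptr,
        out[virtualPos - base] = value;   // so the pointer difference is the element index
    }
};

// Inclusive prefix sum of 0^2, 1^2, ..., (n-1)^2, written into d_out (device memory).
void prefixSumOfSquares(int* d_out, int n, cudaStream_t stream = 0)
{
    CustomGeneratorIterator<int, SquareOfIndex>       in(SquareOfIndex{});
    CustomOutputConsumerIterator<int, ScatterToArray> out(ScatterToArray{ d_out });

    void*  d_temp     = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, in, out, cub::Sum(), n, stream); // size query
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceScan::InclusiveScan(d_temp, temp_bytes, in, out, cub::Sum(), n, stream);
    cudaFree(d_temp);
}
```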
+// + +/*!/------------------------------------------------------------------------------ + * MultiplyKernels.h + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "../SemiRingInterface.h" +#include +#include +#include +#include "../MergeCaseOffsets.h" + +const int RESTART_OFF = 0; +const int RESTART_WRONG_CASE = 1; +const int RESTART_FIRST_ITERATION = 2; +const int RESTART_ITERATION_FINISH = 3; +const int RESTART_ITERATION_UNKNOWN = 4; +const int helper_overhead = 4; +#define WARP_SIZE 32 +#define MAX_CHUNKS_CASE 0x80000000 +#define GENERALIZED_CASE 0xC0000000 +#define CASE_DISTINCTION 0x40000000 // MAX_CHUNKS_CASE - GENERALIZED_CASE + +// Debugging +#define ROW_TO_INVESTIGATE 2579 + +#define ENABLE_SORTING + + + +//################################################### +// Tagged unions / "enums" + +template +struct Either { + union Data { + T tee; + U you; + }; + + Data data; + unsigned char tag; + + __device__ __host__ bool isFirst() const { + return tag == 0; + } + + __device__ __host__ bool isSecond() const { + return tag == 1; + } + + __device__ __host__ const T& valFirst() const { + return data.tee; + } + __device__ __host__ const U& valSecond() const { + return data.you; + } + + static __device__ __host__ Either First(T te) { + Either result; + result.data.tee = te; + result.tag = 0; + return result; + } + static __device__ __host__ Either Second(U u) { + Either result; + result.data.you = u; + result.tag = 1; + return result; + } + __device__ __host__ Either () {} +}; + + + + + +class AcSpGEMMKernels +{ +public: + AcSpGEMMKernels(uint32_t blockDim=128): + blockDim{blockDim} + {} + + void setLaunchDimensions(uint32_t _gridDim, cudaStream_t _stream = 0, uint32_t _blockDim = 128) + { + gridDim = _gridDim; + blockDim = _blockDim; + stream = _stream; + } + + // ##################################################################### + // Determine Block Starts + // + template + void h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, + uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8); + + // ##################################################################### + // SpGEMM stage + // + template + void h_computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + /*fixme const T2 -> */const typename SEMIRING_t::rightInput_t* __restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + // ##################################################################### + // Merge Chunks Simple + // + template + void h_mergeSharedRowsSimple(const 
uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Merge Chunks Max Chunks + // + template + void h_mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Merge Chunks Generalized + // + template + void h_mergeSharedRowsGeneralized(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring); + + // ##################################################################### + // Copy Chunks into CSR format + // + template< typename VALUE_TYPE, typename INDEX_TYPE, typename OFFSET_TYPE> + void h_copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, + VALUE_TYPE * value_out, INDEX_TYPE * index_out, const uint32_t* __restrict result_offets); + + // ##################################################################### + // Calculate temporary memory size + // + template + size_t tempMemSize(size_t CRows); + + // ##################################################################### + // Merge Case assignment + // + template + MergeCaseOffsets assignCombineBlocks(size_t activeRows, void* tempMem, size_t tempMemSize, uint32_t* sharedRows, CUdeviceptr maxPerRowElements, uint32_t* chunckCounter, CUdeviceptr per_block_offsets, CUdeviceptr num_merge_blocks, CUstream stream = 0, CUstream overlapStream = 0); + + // ##################################################################### + // Compute CSR offsets + // + template + void computeRowOffsets(size_t Crows, void* tempMem, size_t tempMemSize, CUdeviceptr inout, CUstream stream = 0); + + +private: + uint32_t blockDim; + uint32_t gridDim; + cudaStream_t stream; +}; + diff --git a/include/GALATIC/include/device/SortAndCombine.cuh b/include/GALATIC/include/device/SortAndCombine.cuh new file mode 100644 index 00000000..13044dad --- /dev/null +++ b/include/GALATIC/include/device/SortAndCombine.cuh @@ -0,0 +1,209 @@ +// Project 
AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * SortAndCombine.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once +#include +#include + +template +class SortAndCombine +{ + template + class CombinerOp + { + SameElement sameElement; + SameRow sameRow; + SEMIRING_t semiring; + public: + __device__ __forceinline__ CombinerOp(SameElement sameElement, SameRow sameRow, SEMIRING_t semiring) : + sameElement(sameElement), + sameRow(sameRow), + semiring(semiring) + { } + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + //T comb; + //uint32_t ca = a.key.get(); + + //////0x1 means we have to add over + ////if (ca & 0x1) + //// comb.value = b.value; + ////else + //// comb.value = a.value + b.value; + + ////same as above, just without conditional + + //float amul = 1.0f - __int_as_float((ca & 0x1) * __float_as_int(1.0f)); + //comb.value = a.value * amul + b.value; + + //// in case we are at the end of the combine elements, we want to increase both by one + //uint32_t modca = ca + ((ca & 0x1) * 0x00020002); + //// we need to add the parts that are outside of the mask + //uint32_t amask = ((ca & 0x10000) * 0xFFFE) ^ 0xFFFEFFFE; + //// in case a new row starts, we need to reset the front part + //uint32_t res = (modca & amask) + b.key.get(); + + //comb.key = decltype(comb.key)(res); + //return comb; + + + uint32_t newastate = (!sameRow(a.index, b.index)) ? (a.getState() & 0xFFFE) : (a.getState() & 0xFFFEFFFE); + //decltype(a.value) amul = sameElement(a.index, b.index) ? SEMIRING_t::MultiplicativeIdentity() : SEMIRING_t::AdditiveIdentity() ; + return T(b.index, semiring.add( sameElement(a.index, b.index) ? 
a.value : (SEMIRING_t::AdditiveIdentity()) , b.value), newastate + b.getState()); + + } + }; +public: + class CombResult + { + uint32_t state; + public: + SORTINDEX_TYPE index; + VALUE_TYPE value; + + __device__ __forceinline__ CombResult() = default; + + __device__ __forceinline__ CombResult(SORTINDEX_TYPE index, VALUE_TYPE value, uint32_t state = 0) : + index(index), value(value), state(state) + { } + + __device__ __forceinline__ CombResult(SORTINDEX_TYPE index, VALUE_TYPE value, bool endElement, bool endRow) : + index(index), value(value), state((endRow ? 0x10000 : 0) | (endElement ? 0x20003 : 0)) + { } + + __device__ __forceinline__ uint32_t getState() const + { + return state; + } + __device__ __forceinline__ uint32_t memoffset() const + { + return ((state >> 1) & 0x7FFF) -1; + } + __device__ __forceinline__ uint32_t rowcount() const + { + return state >> 17; + } + __device__ __forceinline__ bool isResult() const + { + return (state & 0x1) != 0; + } + __device__ __forceinline__ bool isRowend() const + { + return ((state >> 16) & 0x1) != 0; + } + }; + + + using CUBCombIndexValueSort = cub::BlockRadixSort; + using ScanCombinerEntry = CombResult; + using CUBScanCombiner = cub::BlockScan; + + union SMem + { + typename CUBCombIndexValueSort::TempStorage combIndexValueSortTempMem; + typename CUBScanCombiner::TempStorage combinerScanTempMem; + SORTINDEX_TYPE threadFirstElementIdentifier[THREADS + 1]; + }; + + template + __device__ __forceinline__ + static uint32_t combine(SMem& smem, + SORTINDEX_TYPE (&combIndex)[ELEMENTS_PER_THREAD], typename SEMIRING_t::output_t (&data)[ELEMENTS_PER_THREAD], ScanCombinerEntry(&combinedEntries)[ELEMENTS_PER_THREAD], + SameElement sameElement, SameRow sameRow, SEMIRING_t semiring, uint32_t sortbits = sizeof(SORTINDEX_TYPE)*8) + { + + //sort according to RowA/ColumnB (together with shared content) + CUBCombIndexValueSort(smem.combIndexValueSortTempMem).Sort(combIndex, data, 0, sortbits); + __syncthreads(); + + + //figure out who has the last element to be combined + smem.threadFirstElementIdentifier[THREADS] = static_cast(-1); + smem.threadFirstElementIdentifier[threadIdx.x] = combIndex[0]; + __syncthreads(); + + + SORTINDEX_TYPE c = combIndex[ELEMENTS_PER_THREAD - 1]; + SORTINDEX_TYPE oc = smem.threadFirstElementIdentifier[threadIdx.x + 1]; + + combinedEntries[ELEMENTS_PER_THREAD - 1] = CombResult(combIndex[ELEMENTS_PER_THREAD - 1], data[ELEMENTS_PER_THREAD - 1], !sameElement(c, oc), !sameRow(c, oc)); + + + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD - 1; ++i) + { + SORTINDEX_TYPE c = combIndex[i]; + SORTINDEX_TYPE oc = combIndex[i + 1]; + + combinedEntries[i] = CombResult(combIndex[i], data[i], !sameElement(c, oc), !sameRow(c, oc)); + } + + __syncthreads(); + + + //segmented prefix sum to add up / get mem offset for new data + ScanCombinerEntry accumulate; + CUBScanCombiner(smem.combinerScanTempMem).InclusiveScan(combinedEntries, combinedEntries, CombinerOp(sameElement, sameRow,semiring), accumulate); + //uint32_t outputData = tempData + min(TEMP_PER_THREAD * THREADS, TEMP_PER_THREAD * THREADS + RowelementWorkDistribution::workAvailable(smem.workdistributionMem)); + uint32_t count = accumulate.memoffset() + 1; + + + return count; + } +}; + +template +struct PathMergerOp +{ + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + const T Mask = (1 << PerChunkBits) - 1; + T res = 0; + #pragma unroll + for (uint32_t i = 0; i < MaxChunks; ++i) + { + T tmask = Mask << static_cast(i*PerChunkBits); + res = res | (max(a 
& tmask, b & tmask)); + } + return res; + } +}; diff --git a/include/GALATIC/include/device/WorkDistribution.cuh b/include/GALATIC/include/device/WorkDistribution.cuh new file mode 100644 index 00000000..9ca72f7a --- /dev/null +++ b/include/GALATIC/include/device/WorkDistribution.cuh @@ -0,0 +1,328 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
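The packed state word of SortAndCombine::CombResult above folds all of the scan bookkeeping into a single uint32_t; the constants 0x20003 (endElement) and 0x10000 (endRow) follow directly from the accessors isResult(), memoffset(), isRowend() and rowcount(). The small host-side decoder below only restates that field layout for reference; it is a sketch, not part of the patch.

```cpp
#include <cstdint>

// Field layout of CombResult::state, read off its accessors (sketch, not in the patch):
//   bit  0       -> isResult()
//   bits 1..15   -> memoffset() returns this field minus 1
//   bit  16      -> isRowend()
//   bits 17..31  -> rowcount()
struct DecodedCombState
{
    bool     isResult;
    uint32_t countField;   // memoffset() + 1
    bool     isRowEnd;
    uint32_t rowField;     // rowcount()
};

inline DecodedCombState decodeCombState(uint32_t state)
{
    return DecodedCombState{
        (state & 0x1u) != 0,
        (state >> 1) & 0x7FFFu,
        ((state >> 16) & 0x1u) != 0,
        state >> 17
    };
}
```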
+// + +/*!/------------------------------------------------------------------------------ + * ChunkstoCSR.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include + + +template +class WorkDistribution +{ +public: + typedef cub::BlockScan SimpleScanT; + + struct SharedMemT + { + int work_sum[THREADS*ELEMENTS_PER_THREAD_IN + 1]; + }; + + using SharedTempMemT = typename SimpleScanT::TempStorage; + + template + struct SharedTempMemOutT + { + int work_offsets[THREADS*MAX_ELEMENTS_PER_THREAD_OUT]; + }; + + + template + __device__ __forceinline__ + static void initialize(SharedMemT& smem, SharedTempMemT& sum_space, int (&thread_work_count)[ELEMENTS_PER_THREAD_IN]) + { + int* work_sum = smem.work_sum; + + if (!BLOCKIN && ELEMENTS_PER_THREAD_IN > 1) + { + //change from interleaved to blocked + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + work_sum[threadIdx.x + i * THREADS + 1] = thread_work_count[i]; + __syncthreads(); + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + thread_work_count[i] = work_sum[threadIdx.x * ELEMENTS_PER_THREAD_IN + i + 1]; + } + SimpleScanT(sum_space).InclusiveSum(thread_work_count, thread_work_count); + #pragma unroll + for(int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x * ELEMENTS_PER_THREAD_IN + i + 1] = thread_work_count[i]; + } + work_sum[0] = 0; + __syncthreads(); + } + + template + __device__ __forceinline__ + static int assignWorkAllThreads(SharedMemT& smem, SharedTempMemT& sum_space, SharedTempMemOutT& tempmem, + int (&work_element_out)[MAX_ELEMENTS_PER_THREAD_OUT], int(&within_element_id)[MAX_ELEMENTS_PER_THREAD_OUT], + int num_distribute = MAX_ELEMENTS_PER_THREAD_OUT*THREADS) + { + int* work_sum = smem.work_sum; + int* work_offsets = tempmem.work_offsets; + + // clear work offsets + #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[i*THREADS + threadIdx.x] = 0; + + __syncthreads(); + + // compute which thread should start with a given work element + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + int v = work_sum[i*THREADS + threadIdx.x]; + int vn = work_sum[i*THREADS + threadIdx.x + 1]; + if (v < MAX_ELEMENTS_PER_THREAD_OUT*THREADS && v != vn) + work_offsets[v] = i*THREADS + threadIdx.x; + } + + __syncthreads(); + + //compute max per thread elements + num_distribute = min(num_distribute, work_sum[THREADS*ELEMENTS_PER_THREAD_IN]); + + // read my offset (can be the right offset or zero as only the first one will have the right per input element) + #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + { + //if (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i < num_distribute) + work_element_out[i] = work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i]; + //else + //work_element_out[i] = 0; + } + + + SimpleScanT(sum_space).InclusiveScan(work_element_out, work_element_out, cub::Max()); + + int outElements = MAX_ELEMENTS_PER_THREAD_OUT; + if (!BLOCKOUT) + { + + __syncthreads(); + + //stripped layout requires another trip through shared.. 
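+            // ("stripped" here means CUB's striped layout: thread t owns elements
+            //  t, t+THREADS, ...; the scan above produced a blocked arrangement, so the
+            //  results are transposed through shared memory and re-read in striped order.)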
+ #pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i] = work_element_out[i]; + + __syncthreads(); + + // run from back to front so we can just decrese the count iif elements cross thread boundaries (same as below, just with different indices) + #pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT-1; i >= 0; --i) + { + if (i*THREADS + threadIdx.x < num_distribute) + { + work_element_out[i] = work_offsets[threadIdx.x + i*THREADS]; + int workoffset = (threadIdx.x + i*THREADS); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + + //if ((within_element_id[i] < 0 && i + 1 < outElements) || (workoffset >= num_distribute)) + // outElements = i + 1; + } + else + { + outElements = i; + work_element_out[i] = -1; + within_element_id[i] = -1; + } + } + } + else + { + // run from back to front so we can just decrese the count iif elements cross thread boundaries + #pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + int workoffset = (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x+i); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + if (workoffset >= num_distribute) + outElements = i; + } + } + + __syncthreads(); + + // update counts + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x + i*THREADS + 1] = max(0,work_sum[threadIdx.x + i*THREADS + 1] - num_distribute); + // printf("nwork: %d %d : %d\n", blockIdx.x, threadIdx.x + i*THREADS + 1, work_sum[threadIdx.x + i*THREADS + 1]); + } + + __syncthreads(); + + return outElements; + } + + template + __device__ __forceinline__ + static int assignWorkAllThreads_depricated(SharedMemT& smem, SharedTempMemT& sum_space, SharedTempMemOutT& tempmem, + int(&work_element_out)[MAX_ELEMENTS_PER_THREAD_OUT], int(&within_element_id)[MAX_ELEMENTS_PER_THREAD_OUT], + uint32_t* max_A_entry, uint32_t* max_B_for_max_A_entry, int num_distribute = MAX_ELEMENTS_PER_THREAD_OUT*THREADS) + { + int* work_sum = smem.work_sum; + int* work_offsets = tempmem.work_offsets; + + // clear work offsets +#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[i*THREADS + threadIdx.x] = 0; + + __syncthreads(); + + // compute which thread should start with a given work element +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + int v = work_sum[i*THREADS + threadIdx.x]; + int vn = work_sum[i*THREADS + threadIdx.x + 1]; + if (v < MAX_ELEMENTS_PER_THREAD_OUT*THREADS && v != vn) + work_offsets[v] = i*THREADS + threadIdx.x; + } + + __syncthreads(); + + //compute max per thread elements + num_distribute = min(num_distribute, work_sum[THREADS*ELEMENTS_PER_THREAD_IN]); + + // read my offset (can be the right offset or zero as only the first one will have the right per input element) +#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + { + //if (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i < num_distribute) + work_element_out[i] = work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i]; + //else + //work_element_out[i] = 0; + } + + + SimpleScanT(sum_space).InclusiveScan(work_element_out, work_element_out, cub::Max()); + + int outElements = MAX_ELEMENTS_PER_THREAD_OUT; + if (!BLOCKOUT) + { + + __syncthreads(); + + //stripped layout requires another trip through shared.. 
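+            // (Same blocked-to-striped transpose through shared memory as in
+            //  assignWorkAllThreads above; this variant additionally records the last
+            //  assigned work element in max_A_entry / max_B_for_max_A_entry.)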
+#pragma unroll + for (int i = 0; i < MAX_ELEMENTS_PER_THREAD_OUT; ++i) + work_offsets[MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i] = work_element_out[i]; + + __syncthreads(); + + // run from back to front so we can just decrese the count iif elements cross thread boundaries (same as below, just with different indices) +#pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + if (i*THREADS + threadIdx.x < num_distribute) + { + work_element_out[i] = work_offsets[threadIdx.x + i*THREADS]; + int workoffset = (threadIdx.x + i*THREADS); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + + //TODO: needs adjustment for num_distribute + if (max_A_entry && (threadIdx.x == THREADS - 1) && (i == (MAX_ELEMENTS_PER_THREAD_OUT - 1))) + { + // Set max element in A and corresponding max element in B + *max_A_entry = work_element_out[i]; + *max_B_for_max_A_entry = within_element_id[i]; + } + //if ((within_element_id[i] < 0 && i + 1 < outElements) || (workoffset >= num_distribute)) + // outElements = i + 1; + } + else + { + outElements = i; + work_element_out[i] = -1; + within_element_id[i] = -1; + } + } + } + else + { + // run from back to front so we can just decrese the count iif elements cross thread boundaries +#pragma unroll + for (int i = MAX_ELEMENTS_PER_THREAD_OUT - 1; i >= 0; --i) + { + int workoffset = (MAX_ELEMENTS_PER_THREAD_OUT*threadIdx.x + i); + within_element_id[i] = work_sum[work_element_out[i] + 1] - workoffset - 1; + if (workoffset >= num_distribute) + outElements = i; + } + } + + __syncthreads(); + + // update counts +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + work_sum[threadIdx.x + i*THREADS + 1] = max(0, work_sum[threadIdx.x + i*THREADS + 1] - num_distribute); + // printf("nwork: %d %d : %d\n", blockIdx.x, threadIdx.x + i*THREADS + 1, work_sum[threadIdx.x + i*THREADS + 1]); + } + + __syncthreads(); + + return outElements; + } + + __device__ __forceinline__ + static int workAvailable(SharedMemT& smem) + { + //if (threadIdx.x == 0) + // printf("%d work available: %d\n", blockIdx.x, smem.work_sum[ELEMENTS_PER_THREAD_IN*THREADS]); + return const_cast(smem.work_sum)[ELEMENTS_PER_THREAD_IN*THREADS]; + } + __device__ __forceinline__ + static void removework(SharedMemT& smem, int amount) + { + #pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD_IN; ++i) + { + smem.work_sum[threadIdx.x + i*THREADS + 1] = max(0, smem.work_sum[threadIdx.x + i*THREADS + 1] - amount); + } + } +}; \ No newline at end of file diff --git a/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh b/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh new file mode 100644 index 00000000..7a9d4e8c --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_ChunksToCSR.cuh @@ -0,0 +1,127 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * ChunkstoCSR.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" +#include "Chunk.cuh" + +template< typename VALUE_TYPE, typename INDEX_TYPE, typename OFFSET_TYPE> +__global__ void copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, + VALUE_TYPE * value_out, INDEX_TYPE * index_out, const OFFSET_TYPE* __restrict result_offets) +{ + using Chunk = ::Chunk; + + struct Smem + { + uint32_t chunksize; + uint32_t writeoffset; + const VALUE_TYPE* in_values; + const INDEX_TYPE* in_indices; + }; + + __shared__ Smem smem; + + uint32_t counter = blockIdx.x; + + while (counter < *chunk_pointer_alloc) + { + if(threadIdx.x == 0) + { + const Chunk* chunk = reinterpret_cast(chunks_pointers[counter]); + uint32_t chunksize = chunk->num_entries; + const VALUE_TYPE* in_values = chunk->values_direct(chunksize); + const INDEX_TYPE* in_indices = chunk->indices_direct(chunksize); + uint32_t firstrow = chunk->firstrow; + + uint32_t startingOffset = chunk->startingoffset(); + if(startingOffset == 0) + { + if (chunk->firstConsumed()) + { + uint32_t firstoffset = chunk->firstCountCleared(); + chunksize -= firstoffset; + in_values += firstoffset; + in_indices += firstoffset; + ++firstrow; + } + if (chunk->lastConsumed() && !chunk->isDirect()) + chunksize -= chunk->lastCountCleared(); + } + + smem.chunksize = chunksize; + smem.in_values = in_values; + smem.in_indices = in_indices; + + //special case for multiple chunk rows (need offset for writing!) 
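+                // (result_offets[firstrow] is the CSR row pointer of the chunk's first row;
+                //  startingoffset() is this chunk's offset within that row when the row is
+                //  split across several chunks, so the sum is the absolute write position.)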
+ smem.writeoffset = startingOffset + result_offets[firstrow]; + } + __syncthreads(); + + //write out + for (uint32_t i = threadIdx.x; i < smem.chunksize; i += blockDim.x) + { + value_out[smem.writeoffset + i] = smem.in_values[i]; + index_out[smem.writeoffset + i] = smem.in_indices[i]; + } + + counter += gridDim.x; + } + +} + +template +void AcSpGEMMKernels::h_copyChunks(void* const* __restrict chunks_pointers, const uint32_t* __restrict chunk_pointer_alloc, VALUE_TYPE * value_out, INDEX_TYPE * index_out, const uint32_t* __restrict result_offets) +{ + int blockSize(256); + + static size_t copyBlocksOnGPU = 0; + if (copyBlocksOnGPU == 0) + { + CUdevice dev; + cudaGetDevice(&dev); + int occ, sm; + void(*ptr)(void* const* __restrict, const uint32_t* __restrict, VALUE_TYPE *, INDEX_TYPE * index_out, const uint32_t* __restrict) = copyChunks< VALUE_TYPE, INDEX_TYPE, OFFSET_TYPE>; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occ, ptr, blockSize, 0); + cudaDeviceGetAttribute(&sm, cudaDevAttrMultiProcessorCount, dev); + copyBlocksOnGPU = sm*occ; + } + copyChunks <<>>(chunks_pointers, chunk_pointer_alloc, value_out, index_out, result_offets); +} diff --git a/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh b/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh new file mode 100644 index 00000000..5a185112 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_DetermineBlockStarts.cuh @@ -0,0 +1,113 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
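h_copyChunks above sizes its grid once from the device's multiprocessor count and the kernel's occupancy, and copyChunks then iterates over chunks with a block-stride loop (counter += gridDim.x). A standalone sketch of that launch-sizing pattern follows; myKernel, fullOccupancyGrid and blockSize are placeholders, not names from the patch.

```cpp
// Sketch (illustrative): occupancy-based grid sizing for a persistent, grid-stride kernel.
#include <cuda_runtime.h>

__global__ void myKernel(const int* in, int* out, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        out[i] = in[i];   // placeholder body
}

int fullOccupancyGrid(int blockSize)
{
    int device = 0, numSMs = 0, blocksPerSM = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
    // Maximum number of resident blocks of myKernel per SM at this block size
    // (0 bytes of dynamic shared memory).
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, myKernel, blockSize, 0);
    return numSMs * blocksPerSM;   // just enough blocks to fill the device once
}

// Usage: myKernel<<<fullOccupancyGrid(256), 256>>>(d_in, d_out, n);
```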
+// + +/*!/------------------------------------------------------------------------------ + * DetermineBlockStarts.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" +#include "../common.h" + + +template +__global__ void DetermineBlockStarts(int num_other, const OFFSET_TYPE*__restrict offsets, uint32_t* startingIds, + uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id > num_other) + return; + + int a = offsets[id]; + int b = offsets[min(id + 1, num_other)]; + + int blocka = divup(a, NNZ_PER_BLOCK); + int blockb = (b - 1) / static_cast(NNZ_PER_BLOCK); + + //iterate over all blocks that start with that row + for (; blocka <= blockb; ++blocka) + startingIds[blocka] = id; + + //write last + if (id == num_other) + startingIds[divup(b, NNZ_PER_BLOCK)] = id - 1; + else + { + toClear[id] = 0, + toClear1[id] = 0; + } + toClear2[id] = 0; + + for (int i = id; i < num3; i+=num_other) + { + toClear3[i] = 0; + } + + for (int i = id; i < num4; i += num_other) + { + toClear4[i] = 0; + } + + for (int i = id; i < num5; i += num_other) + { + toClear5[i] = 0; + toClear6[i] = 0; + //toClear7[i] = 0; + } + + for (int i = id; i < num8; i += num_other) + { + toClear8[i] = 0; + } +} + +template +void AcSpGEMMKernels::h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, + int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8) +{ + // This method has a tendency to access memory illegally + DetermineBlockStarts <<>>(num_other, offsets, startingIds, toClear, toClear1, toClear2, num3, toClear3, + num4, toClear4, + num5, toClear5, toClear6, toClear7, + num8, toClear8); +} + + +#define GPUCompressedMatrixMatrixMultiplyBlockStarts(THREADS, NNZPERTHREAD) \ + template void AcSpGEMMKernels::h_DetermineBlockStarts(int num_other, const uint32_t*__restrict offsets, uint32_t* startingIds, uint64_t* toClear, uint32_t* toClear1, uint32_t* toClear2, int num3, uint32_t* toClear3, int num4, uint32_t* toClear4, int num5, uint32_t* toClear5, uint32_t* toClear6, uint32_t* toClear7, int num8, uint32_t* toClear8); + diff --git a/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh b/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh new file mode 100644 index 00000000..dde2f78c --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeGeneralized.cuh @@ -0,0 +1,738 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software 
without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeGeneralized.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" + +#define ELEMENT_TO_SEARCH 10198 + +//binary search for an element in an array; returns the number of elements that are smaller or equal than the one +// we are looking for +template +__device__ __forceinline__ SIZE_TYPE binarySearch(const DATA_TYPE* start, const SIZE_TYPE count, const DATA_TYPE target) +{ + if (count == 0) + return 0; + + SIZE_TYPE lower_bound = 0; + SIZE_TYPE upper_bound = count - 1; + SIZE_TYPE tmp_loc; + + if (target < start[lower_bound]) + return 0; + + if (target > start[count - 1]) + return count; + + while (lower_bound <= upper_bound) + { + tmp_loc = (lower_bound + upper_bound) >> 1; + + if (target < start[tmp_loc]) + { + upper_bound = tmp_loc - 1; + } + else if (target > start[tmp_loc]) + { + lower_bound = tmp_loc + 1; + } + else + { + //we can have multiple target entries - let's skip them until we point after the last target + while (tmp_loc < count && start[tmp_loc] == target) + ++tmp_loc; + + return tmp_loc; + } + } + + return lower_bound; //element not found; return id of first element larger than target +} + +// samples the interval [lower, upper] s.t. 
each of the num_samples sub intervals is approximately the same size +template +__device__ __forceinline__ INDEX_TYPE getSample(INDEX_TYPE lower, INDEX_TYPE upper, uint32_t num_samples, uint32_t sample_point) +{ + float alpha = static_cast(sample_point + 1) / num_samples; + return (1 - alpha) * lower + alpha * upper; +} + +template +__device__ __forceinline__ uint32_t samplePosition(uint32_t minID, uint32_t maxID, int position = threadIdx.x + 1) +{ + return (divup((maxID - minID), THREADS)) * (position); +} + +template +__device__ __forceinline__ uint32_t sampling(typename cub::BlockScan::TempStorage& atomicMaxScanTemp, + INDEX_TYPE minID, INDEX_TYPE maxID, int numberChunks, uint32_t* max_sampling_category, + uint32_t (&sample_offsets)[THREADS], const INDEX_TYPE *__restrict__(&chunkIndices)[MERGE_MAX_CHUNKS], uint32_t (&chunkElementCount)[MERGE_MAX_CHUNKS]) +{ + uint32_t sampling_step = divup((maxID - minID), THREADS); + uint32_t my_sample_offset = 0; + for (auto round = 0; round < numberChunks; ++round) + { + // Reset intermediary offset + if (threadIdx.x == 0) + *max_sampling_category = 0; + sample_offsets[threadIdx.x] = 0; + __syncthreads(); + uint32_t count = chunkElementCount[round]; + for (int i = threadIdx.x; i < count - 1; i += THREADS) + { + // Fetch column Ids + INDEX_TYPE columnIndex = chunkIndices[round][i]; + INDEX_TYPE nextColumnIndex = chunkIndices[round][i + 1]; + INDEX_TYPE sampling_category = (columnIndex > 0) ? (columnIndex - 1) / sampling_step : 0; + INDEX_TYPE next_sampling_category = (nextColumnIndex - 1) / sampling_step; + if (sampling_category != next_sampling_category) + { + if (sampling_category < THREADS) + sample_offsets[sampling_category] = i + 1; + atomicMax(max_sampling_category, sampling_category); + } + } + __syncthreads(); + + // Set max + if (*max_sampling_category < (THREADS - 1)) + sample_offsets[*max_sampling_category + 1] = count; + __syncthreads(); + + uint32_t sample_value[1] = { sample_offsets[threadIdx.x] }; + // Propagate Max + cub::BlockScan(atomicMaxScanTemp).InclusiveScan(sample_value, sample_value, cub::Max()); + __syncthreads(); + // Write to global sample offsets + my_sample_offset += sample_value[0]; + __syncthreads(); + } + return my_sample_offset; +} + + +const int GlobalPathOffset = 0; +const int MinColumnOffset = 1; +const int MaxColumnOffset = 2; +const int ElementsHandledOffset = 3; + +// ######################################################################################### +// +// Generalized Case +// +// ######################################################################################### +template +__global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsGeneralized(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + static_assert(2 * INPUT_ELEMENTS_PER_THREAD * THREADS >= MERGE_MAX_CHUNKS, "Too many elements per column possible now!"); + + using Chunk = ::Chunk; + + using DirectChunk = ::DirectChunk; + + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SortAndCombiner = 
SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + typedef cub::BlockScan SimpleScanT; + const uint32_t LengthSamplesPerThread = (MERGE_MAX_CHUNKS + THREADS - 1) / THREADS; + using SingleLoadWorkDistribution = WorkDistribution; + using IndexSorter = cub::BlockRadixSort; + + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + using OUT_t = typename SEMIRING_t::output_t; + + struct SMem + { + uint32_t runflag, restart/*, max_sampling_category*/; + uint32_t numSharedRow; + int numChunks; + INDEX_TYPE maxColumnIdRow, currentMinColumnIdRow, currentMaxColumnIdRow; + int sumOut; + uint32_t completed; + uint32_t longChunkOffset; + INDEX_TYPE globalPath; + INDEX_TYPE elementsHandled; + + + + const INDEX_TYPE* __restrict chunkIndices[MERGE_MAX_CHUNKS]; + Either chunkValues[MERGE_MAX_CHUNKS]; + LEFT_T multiplier[MERGE_MAX_CHUNKS]; + uint32_t chunkElementCount[MERGE_MAX_CHUNKS]; + INDEX_TYPE sample_offsets[THREADS]; + INDEX_TYPE elementsInChunkConsumed[MERGE_MAX_CHUNKS]; + uint32_t current_path_elements[MERGE_MAX_CHUNKS]; + + // Used for sorting + uint32_t indexing[MERGE_MAX_CHUNKS]; + + union { + struct + { + ChunkSortType sort_keys[MERGE_MAX_CHUNKS]; + typename IndexSorter::TempStorage indexptrtempmem; + }; + + struct { + typename SingleLoadWorkDistribution::SharedMemT single_workdistributionMem; + typename SingleLoadWorkDistribution::SharedTempMemT single_workdistributionTempMem; + typename SingleLoadWorkDistribution:: template SharedTempMemOutT single_workdistributionTempMemOutFull; + }; + + typename SortAndCombiner::SMem single_sAndCMem; + + struct { + typename SEMIRING_t::output_t longOutDataBuffer[THREADS]; + INDEX_TYPE longOutIndexBuffer[THREADS]; + }; + }; + + }; + + __shared__ SMem smem; + + //determine the block's offset + if (threadIdx.x == 0) + { + uint32_t shared_handled = shared_rows_handled[(blockIdx.x + restart_offset)]; + smem.numSharedRow = 1 - shared_handled; + smem.runflag = *run_flag; + smem.restart = restart_completion[(blockIdx.x + restart_offset)]; + smem.sumOut = (smem.restart > RESTART_FIRST_ITERATION) ? 
output_row_count[sharedRows[blockIdx.x]] : 0; + } + __syncthreads(); + + // Already handled + if (smem.numSharedRow == 0) + return; + + __syncthreads(); + + if (threadIdx.x == 0) + { + //Get the one chunk that has elements of the block's row + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[blockIdx.x]]); + // DEBUG + // if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in Generalized\n", sharedRows[blockIdx.x]); + // DEBUG + uint32_t chunk_counter = 0; + + smem.currentMinColumnIdRow = std::numeric_limits::max(); + smem.maxColumnIdRow = 0; + + //As long as we have some chunk that has elements of the block's row keep reading + while (chunk != 0) + { + INDEX_TYPE minColumnId, maxColumnId; + bool first_row = (chunk & 2) != 0; + //get a pointer to the current chunk + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + uint32_t count; + const INDEX_TYPE* pIndices; + Either pValues; + int32_t numentries = pChunk->num_entries; + LEFT_T multiplier; + + smem.sort_keys[chunk_counter] = pChunk->sort_key; + + if (first_row) + { + // only first_row chunks can be direct ones + if (pChunk->isDirect()) + { + DirectChunk* __restrict pDirectChunk = reinterpret_cast(pChunk); + count = numentries; + pIndices = pDirectChunk->indices_direct(numentries); + pValues = Either::First(pDirectChunk->values_direct(numentries)); + multiplier = pDirectChunk->getMultiplier(); + chunk = reinterpret_cast(pChunk->readNextFront()); + pDirectChunk->setFirstConsumed(); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + } + else + { + count = pChunk->firstCountCleared(); + pChunk->setFirstConsumed(); + pIndices = pChunk->indices_direct(numentries); + pValues = Either::Second(pChunk->values_direct(numentries)); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + chunk = reinterpret_cast(pChunk->readNextFront()); + } + } + else + { + count = pChunk->lastCountCleared(); + pChunk->setLastConsumed(); + uint32_t baseoffset = numentries - count; + pIndices = pChunk->indices_direct(numentries) + baseoffset; + pValues = Either::Second(pChunk->values_direct(numentries) + baseoffset); + minColumnId = pIndices[0]; + maxColumnId = pIndices[count - 1]; + chunk = reinterpret_cast(pChunk->readNextBack()); + } + + //Update global min/max column id + smem.currentMinColumnIdRow = min(smem.currentMinColumnIdRow, minColumnId); + smem.maxColumnIdRow = max(smem.maxColumnIdRow, maxColumnId); + smem.currentMaxColumnIdRow = smem.maxColumnIdRow; + + // We do not have enough memory to store more chunk info + if (chunk_counter >= MERGE_MAX_CHUNKS) + { + printf("ERROR: number of chunks (%d) exceeds maximum (%d) in block: %u;\n", chunk_counter, MERGE_MAX_CHUNKS, blockIdx.x); + __trap(); + smem.runflag = 1; + break; + } + else + { + smem.chunkIndices[chunk_counter] = pIndices; + + smem.chunkValues[chunk_counter] = pValues; + smem.chunkElementCount[chunk_counter] = count; + smem.multiplier[chunk_counter] = multiplier; + } + + ++chunk_counter; + } + smem.numChunks = chunk_counter; + } + __syncthreads(); + + if (smem.runflag != 0) + return; + + // Sort chunks + { + ChunkSortType key[LengthSamplesPerThread]; + uint32_t value[LengthSamplesPerThread]; + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + value[i / THREADS] = i; +#ifdef ENABLE_SORTING + if(i < smem.numChunks) + key[i/THREADS] = smem.sort_keys[i]; + else + key[i / THREADS] = 0xFFFFFFFF; +#endif + } +#ifdef ENABLE_SORTING + IndexSorter(smem.indexptrtempmem).Sort(key, value); +#endif + for 
(int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.indexing[(threadIdx.x*LengthSamplesPerThread) + (i / THREADS)] = value[i / THREADS]; + //smem.indexing[i] = i; + } + } + __syncthreads(); + + int chunkWorkElements[LengthSamplesPerThread]; + //Perform the sampling + if (smem.restart < RESTART_FIRST_ITERATION) + { + //determine for each thread which column id he has to look for in the chunks + uint32_t sample = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, threadIdx.x); + + //warp based sampling in rounds; in round r thread i works on chunk (i+r) % n + INDEX_TYPE my_sample_offset = 0; + int wid = threadIdx.x / 32; + for (auto round = 0; round < smem.numChunks; ++round) + { + uint32_t count = smem.chunkElementCount[smem.indexing[(wid + round) % smem.numChunks]]; + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[(wid + round) % smem.numChunks]]; + //perform binary search for sample in [pIndices, pIndices + count) and accumulate sample_locations + my_sample_offset += binarySearch(pIndices, count, sample); + } + + //uint32_t my_sample_offset = sampling(smem.atomicMaxScanTemp, smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.numChunks, &(smem.max_sampling_category), smem.sample_offsets, smem.chunkIndices, smem.chunkElementCount); + + //write the threads sample offset to shared + smem.sample_offsets[threadIdx.x] = my_sample_offset; + restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x] = my_sample_offset; + if (threadIdx.x == 0) + { + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset] = smem.currentMinColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset] = smem.currentMaxColumnIdRow; + } + } + //We already restarted at least once and have done at least one iteration in the last run, hence, we have values that we want to reuse + else + { + smem.sample_offsets[threadIdx.x] = restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x]; + if (threadIdx.x == 0) + { + smem.currentMinColumnIdRow = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset]; + smem.currentMaxColumnIdRow = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset]; + } + } + + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.elementsInChunkConsumed[i] = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) + { + smem.globalPath = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset]; + smem.elementsHandled = restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + ElementsHandledOffset]; + smem.restart = RESTART_FIRST_ITERATION; + } + + //we want to wait here s.t. e.g. 
smem.sample_offsets is available + __syncthreads(); + + bool sampling_required{ false }; + while (true) + { + // Maybe resampling is required + if (sampling_required) + { + if (threadIdx.x == 0) + { + uint32_t minColumnIdRow = smem.currentMinColumnIdRow; + + if (smem.globalPath > 0 && smem.globalPath != static_cast(-1)) + { + smem.currentMinColumnIdRow = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, smem.globalPath - 1); + //smem.currentMinColumnIdRow = samplePosition(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.globalPath - 1); + } + + if (minColumnIdRow == smem.currentMinColumnIdRow && smem.globalPath != static_cast(-1)) + { + smem.currentMaxColumnIdRow = (smem.currentMinColumnIdRow + smem.currentMaxColumnIdRow) >> 1; + } + + smem.globalPath = 0; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MinColumnOffset] = smem.currentMinColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + MaxColumnOffset] = smem.currentMaxColumnIdRow; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset] = smem.globalPath; + } + __syncthreads(); + sampling_required = false; + + //determine for each thread which column id he has to look for in the chunks + uint32_t sample = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, threadIdx.x); + + //warp based sampling in rounds; in round r thread i works on chunk (i+r) % n + INDEX_TYPE my_sample_offset = 0; + int wid = threadIdx.x / 32; + for (auto round = 0; round < smem.numChunks; ++round) + { + uint32_t count = smem.chunkElementCount[smem.indexing[(wid + round) % smem.numChunks]]; + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[(wid + round) % smem.numChunks]]; + //perform binary search for sample in [pIndices, pIndices + count) and accumulate sample_locations + my_sample_offset += binarySearch(pIndices, count, sample); + } + //write the threads sample offset to shared + smem.sample_offsets[threadIdx.x] = my_sample_offset; + + //uint32_t my_sample_offset = sampling(smem.atomicMaxScanTemp, smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.numChunks, &(smem.max_sampling_category), smem.sample_offsets, smem.chunkIndices, smem.chunkElementCount); + restart_sampleOffs[blockIdx.x * THREADS + threadIdx.x] = my_sample_offset; + } + __syncthreads(); + + //Decide where to perform the next cut; how many elements/columns do we want to handle now? 
+ // after this the variables are updated to hold the new path [start sample id, end sample id) + bool path_boundary = false; + bool last_path = false; + //check whether we can handle all remaining columns now; this would be the last path + if (smem.sample_offsets[THREADS - 1] - smem.elementsHandled <= ELEMENTS_PER_THREAD * THREADS) + { + if (threadIdx.x == THREADS - 1) + last_path = true; + } + else + { + path_boundary = threadIdx.x >= smem.globalPath && threadIdx.x < THREADS - 1 && + smem.sample_offsets[threadIdx.x] - smem.elementsHandled <= ELEMENTS_PER_THREAD * THREADS && + smem.sample_offsets[threadIdx.x + 1] - smem.elementsHandled > ELEMENTS_PER_THREAD * THREADS && + smem.sample_offsets[threadIdx.x] - smem.elementsHandled != 0; + } + + // If no path can be chosen as any are too large to be handled -> resample + sampling_required = __syncthreads_and(!path_boundary && !last_path); + if (sampling_required) + continue; + + //the thread with the id of the last column that should be handled updates the global path boundaries + if (path_boundary || last_path) + { + smem.globalPath = threadIdx.x + 1; //first sample id *not* in the current path + smem.completed = last_path; + } + __syncthreads(); + + //For each chunk: determine cutoff id using a binary search aka. determine local path + for(int i = 0; i < LengthSamplesPerThread; ++i) + chunkWorkElements[i] = 0; + for (int chunk = threadIdx.x; chunk < smem.numChunks; chunk += THREADS) + { + const INDEX_TYPE* pIndices = smem.chunkIndices[smem.indexing[chunk]]; + uint32_t count = smem.chunkElementCount[smem.indexing[chunk]]; + //how much of this chunk did we already consume? This is at the same time the start of the next local path; + const uint32_t prev_cutoff = smem.elementsInChunkConsumed[chunk]; + + //determine how many elements of this chunk are part of the current path + uint32_t look_for = getSample(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, THREADS, smem.globalPath - 1); + //uint32_t look_for = samplePosition(smem.currentMinColumnIdRow, smem.currentMaxColumnIdRow, smem.globalPath - 1); + + smem.current_path_elements[chunk] = (count > prev_cutoff) ? 
binarySearch(pIndices + prev_cutoff, count - prev_cutoff, look_for) : 0; + //update the number of consumed elements for each chunk + smem.elementsInChunkConsumed[chunk] += smem.current_path_elements[chunk]; + //how many elements to handle in this chunk in the current path + chunkWorkElements[chunk / THREADS] = smem.current_path_elements[chunk]; + } + __syncthreads(); + + SingleLoadWorkDistribution:: template initialize(smem.single_workdistributionMem, smem.single_workdistributionTempMem, chunkWorkElements); + + int chunk[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = SingleLoadWorkDistribution:: template assignWorkAllThreads( + smem.single_workdistributionMem, smem.single_workdistributionTempMem, smem.single_workdistributionTempMemOutFull, + chunk, element); + + //combine entries of the current path in shared and write them into global + int numOut; + // Combine entries + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + uint32_t combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const INDEX_TYPE* __restrict ip = smem.chunkIndices[smem.indexing[chunk[i]]] + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + combIndex[i] = ip[element[i]]; + + if ( smem.chunkValues[smem.indexing[chunk[i]]].isFirst()) { + const RIGHT_t* dp = smem.chunkValues[smem.indexing[chunk[i]]].valFirst() + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + data[i] = semiring.multiply(smem.multiplier[smem.indexing[chunk[i]]], dp[element[i]]); + + } else { + const OUT_t* dp = smem.chunkValues[smem.indexing[chunk[i]]].valSecond() + smem.elementsInChunkConsumed[chunk[i]] - smem.current_path_elements[chunk[i]]; + data[i] = dp[element[i]]; + } + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + __syncthreads(); + + numOut = SortAndCombiner::combine(smem.single_sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [](auto a, auto b) { + return true; + }, semiring); + // ######## DEBUG + //if (numOut == 0 && threadIdx.x == 0) + //{ + // printf("%d %d oops in generalized\n", blockIdx.x, threadIdx.x); + //} + // ######## DEBUG + } + + // create new chunk (could also reuse old ones if completely used up...?) 
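+ // (Explanatory note, not part of the original kernel.) The single-thread block
+ // below allocates a fresh output chunk for the numOut combined entries. If the
+ // chunk allocator or the chunk-pointer array runs out of space, the block stores
+ // its restart state (restart_completion here; sample offsets and per-chunk
+ // consumption are written elsewhere in this kernel) and raises a bit in
+ // *run_flag so the host can grow the corresponding buffer and relaunch:
+ //   0x1 -> chunk memory exhausted (allocChunk failed)
+ //   0x2 -> chunk-pointer array exhausted
+ // A hedged host-side sketch of how such a flag could be polled; the identifiers
+ // (d_run_flag, h_run_flag) are illustrative only, not the library's actual host API:
+ #if 0
+ uint32_t h_run_flag = 0;                                    // host copy of the flag
+ cudaMemcpy(&h_run_flag, d_run_flag, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ if (h_run_flag & 0x1) { /* enlarge chunk memory, keep restart state, relaunch merge */ }
+ if (h_run_flag & 0x2) { /* enlarge the chunk-pointer array, relaunch merge */ }
+ #endif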
+ if (threadIdx.x == 0) + { + uint32_t chunkoff; + int ignored; + if (!allocChunk(numOut, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + chunkoff = static_cast(-1); + atomicOr(run_flag, 0x1); + // Write restart state + restart_completion[(blockIdx.x + restart_offset)] = smem.restart; + } + else + { + //need to add flag and offset for copy later (offset = s) + uint32_t s = smem.sumOut; + INDEX_TYPE actualrow = sharedRows[blockIdx.x]; + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = static_cast(-1); + atomicOr(run_flag,0x2); + if (chunk_pointer_position == chunk_pointer_sizes) + { + *chunk_pointer_pos = chunk_pointer_sizes; + } + restart_completion[(blockIdx.x + restart_offset)] = smem.restart; + } + else + { + //FIXME SUSPICIOUS LINE april 25 + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::place(chunks, chunkoff, numOut, actualrow, Chunk::StartingOffsetFlag | s, 0)); + //write row count + s += numOut; + smem.sumOut = s; + output_row_count[actualrow] = s; + } + } + smem.longChunkOffset = chunkoff; + } + __syncthreads(); + + if (smem.longChunkOffset == static_cast(-1)) + { + return; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + + smem.longOutDataBuffer[pwrite] = combinedEntries[i].value; + smem.longOutIndexBuffer[pwrite] = combinedEntries[i].index; + } + } + __syncthreads(); + + //write outg + if (written + threadIdx.x < numOut) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.longChunkOffset)->values_direct(numOut); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.longChunkOffset)->indices_direct(numOut); + + valstart[written + threadIdx.x] = smem.longOutDataBuffer[threadIdx.x]; + indexstart[written + threadIdx.x] = smem.longOutIndexBuffer[threadIdx.x]; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + i] = smem.elementsInChunkConsumed[i]; + } + if (threadIdx.x == 0) + { + smem.elementsHandled = smem.sample_offsets[smem.globalPath - 1]; //update path start (first sample id in the path) + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + GlobalPathOffset] = smem.globalPath; + restart_chunkElementsConsumedAndPath[blockIdx.x * (MERGE_MAX_CHUNKS + helper_overhead) + MERGE_MAX_CHUNKS + ElementsHandledOffset] = smem.elementsHandled; + } + __syncthreads(); + + // Work is done, we can stop now + if (smem.completed) + { + if(smem.currentMaxColumnIdRow == smem.maxColumnIdRow) + break; + + __syncthreads(); + + if (threadIdx.x == 0) + { + smem.globalPath = static_cast(-1); + smem.currentMinColumnIdRow = smem.currentMaxColumnIdRow + 1; + smem.currentMaxColumnIdRow = smem.maxColumnIdRow; + } + sampling_required = true; + } + + smem.restart = RESTART_ITERATION_UNKNOWN; + __syncthreads(); + } + + // This row is done + if (threadIdx.x == 0) + { + shared_rows_handled[(blockIdx.x + restart_offset)] = 1; + } +} + +template + void AcSpGEMMKernels::h_mergeSharedRowsGeneralized(const uint32_t* 
__restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + mergeSharedRowsGeneralized<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, + restart_sampleOffs, restart_chunkElementsConsumedAndPath, restart_offset, chunk_pointer_pos, semiring); +} + + +#define GPUCompressedMatrixMatrixMultiplyMergeGeneralized(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsGeneralized \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, \ + uint32_t* restart_sampleOffs, uint32_t* restart_chunkElementsConsumedAndPath, uint32_t restart_offset, uint32_t* chunk_pointer_pos); + diff --git a/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh b/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh new file mode 100644 index 00000000..305ad039 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeMaxChunks.cuh @@ -0,0 +1,890 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeMaxChunks.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +// Local includes +#include "MultiplyKernels.h" +#include "../meta_utils.h" +#include // operator typeid + + +#include + +#define DIVISION_FACTOR 2 + + + + + +// ######################################################################################### +// Resampling +// + template +__device__ __forceinline__ void printSampling(const uint32_t* __restrict sharedRows, int numChunks, INDEX_TYPE (&id_samples)[MERGE_MAX_CHUNKS][MERGE_MAX_PATH_OPTIONS], + int row_index) +{ + if (sharedRows[blockIdx.x] == row_index && threadIdx.x == 0) + { + for (int i = 0; i < numChunks*MERGE_MAX_PATH_OPTIONS; ++i) + { + if (i % MERGE_MAX_PATH_OPTIONS == 0) + printf("\n"); + printf("%u ", id_samples[i / MERGE_MAX_PATH_OPTIONS][i % MERGE_MAX_PATH_OPTIONS]); + } + printf("\n"); + } +} + +__device__ __forceinline__ void printInvalidPath(const uint32_t* __restrict sharedRows) +{ + if (threadIdx.x == 0) + { + printf("%u\n", sharedRows[blockIdx.x]); + } +} + +__device__ __forceinline__ void printCountPerSampling(const uint32_t* __restrict sharedRows, uint32_t outputCount, uint32_t sampleID, uint32_t UpperBound, uint32_t row) +{ + if (outputCount < UpperBound && sharedRows[blockIdx.x] == row) + { + printf("Thread: %u -- Outputcount: %u -- SampleID: %u\n", threadIdx.x, outputCount, sampleID); + } +} + + + + + +// ######################################################################################### +// +// Max Chunks Case +// +// ######################################################################################### +template + __global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, + SEMIRING_t semiring) +{ + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + + using OUT_t = typename SEMIRING_t::output_t; + + using Chunk = ::Chunk; + + using DirectChunk = ::DirectChunk; + + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SingleLoadWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + const uint32_t PathEncodingBits = 32 - count_clz::value; + using PathEncoding = ChooseBitDataType::value>; + + + constexpr const uint32_t LengthSamplesPerThread = ((MERGE_MAX_PATH_OPTIONS + 1)*MERGE_MAX_CHUNKS + THREADS - 1) / 
THREADS; + + constexpr bool problem = LengthSamplesPerThread >= 1; + static_assert(problem, "LengthSamplesPerThread must be >= 1"); + + using SampleSorter = cub::BlockRadixSort; + using PathMergeScan = cub::BlockScan; + using IndexSorter = cub::BlockRadixSort; + + struct SMem + { + + + uint32_t runflag, restart, halveStep; + uint32_t startSharedRow, numSharedRow; + int numChunks; + int sumOut; + uint32_t completed; + PathEncoding usePath; + union { + INDEX_TYPE useMaxId; + uint32_t remCounter; + }; + uint32_t longChunkOffset; + const INDEX_TYPE* __restrict chunkIndices[MERGE_MAX_CHUNKS]; + Either chunkValues[MERGE_MAX_CHUNKS]; //RL FIXME : add restrict back to internal pointer types? + T multiplier[MERGE_MAX_CHUNKS]; + uint32_t chunkElementCount[MERGE_MAX_CHUNKS]; + volatile uint32_t chunkTakeElements[MERGE_MAX_CHUNKS]; + + // Used for sorting + uint32_t indexing[MERGE_MAX_CHUNKS]; + + union { + struct + { + ChunkSortType sort_keys[MERGE_MAX_CHUNKS]; + typename IndexSorter::TempStorage indexptrtempmem; + }; + struct + { + union { + INDEX_TYPE id_samples[MERGE_MAX_CHUNKS][MERGE_MAX_PATH_OPTIONS]; + struct { + typename SampleSorter::TempStorage sorterTempMem; + typename PathMergeScan::TempStorage pathmergeTempMem; + }; + struct { + uint32_t downStreamCount[THREADS + 1]; + INDEX_TYPE downStreamIndices[THREADS + 1]; + }; + }; + }; + struct { + typename SingleLoadWorkDistribution::SharedMemT single_workdistributionMem; + typename SingleLoadWorkDistribution::SharedTempMemT single_workdistributionTempMem; + typename SingleLoadWorkDistribution:: template SharedTempMemOutT single_workdistributionTempMemOutFull; + }; + typename SortAndCombiner::SMem single_sAndCMem; + struct { + OUT_t longOutDataBuffer[THREADS]; + INDEX_TYPE longOutIndexBuffer[THREADS]; + }; + }; + }; + + __shared__ SMem smem; + + //get my block's offset + if (threadIdx.x == 0) + { + uint32_t shared_handled = shared_rows_handled[blockIdx.x + restart_offset]; + smem.numSharedRow = 1 - shared_handled; + smem.runflag = *run_flag; + smem.restart = restart_completion[blockIdx.x + restart_offset]; + smem.sumOut = (smem.restart > RESTART_FIRST_ITERATION) ? 
output_row_count[sharedRows[blockIdx.x]] : 0; + smem.halveStep = 0; + } + __syncthreads(); + + if (smem.numSharedRow == 0) + return; + + + + // Read in chunks (maximum MERGE_MAX_CHUNKS) + if (threadIdx.x == 0 && smem.restart < RESTART_FIRST_ITERATION) + { + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[blockIdx.x]]); + // if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in MAX CHUNKS\n", sharedRows[blockIdx.x]); + uint32_t chunk_counter = 0; + uint32_t outsum = 0; + + while (chunk != 0) + { + bool first_row = (chunk & 2) != 0; + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + uint32_t count; + const INDEX_TYPE* pIndices; + Either pValues; + int32_t numentries = pChunk->num_entries; + typename SEMIRING_t::leftInput_t multiplier; + + smem.sort_keys[chunk_counter] = pChunk->sort_key; + + if (first_row) + { + //only first rows can be direct + if (pChunk->isDirect()) + { + DirectChunk* __restrict pDirectChunk = reinterpret_cast(pChunk); + count = numentries; + pIndices = pDirectChunk->indices_direct(numentries); + pValues = Either::First(pDirectChunk->values_direct(numentries)); + multiplier = pDirectChunk->getMultiplier(); + pDirectChunk->setFirstConsumed(); + chunk = reinterpret_cast(pChunk->readNextFront()); + } + else + { + count = pChunk->firstCountCleared(); + pChunk->setFirstConsumed(); + pIndices = pChunk->indices_direct(numentries); + pValues =Either::Second( pChunk->values_direct(numentries)); + chunk = reinterpret_cast(pChunk->readNextFront()); + } + } + else + { + count = pChunk->lastCountCleared(); + pChunk->setLastConsumed(); + uint32_t baseoffset = numentries - count; + pIndices = pChunk->indices_direct(numentries) + baseoffset; + pValues = Either::Second(pChunk->values_direct(numentries) + baseoffset); + chunk = reinterpret_cast(pChunk->readNextBack()); + } + + if (chunk_counter >= MERGE_MAX_CHUNKS) + { + printf("%d %d too many chunks: %d %d : count is : %u and should not be more than: %u\n", blockIdx.x, threadIdx.x, chunk_counter + 1, outsum + count, output_row_count[sharedRows[blockIdx.x]], ELEMENTS_PER_THREAD *THREADS * (MERGE_MAX_CHUNKS - 1)); + smem.runflag = 1; + } + else + { + smem.chunkIndices[chunk_counter] = pIndices; + smem.chunkValues[chunk_counter] = pValues; + smem.chunkElementCount[chunk_counter] = count; + smem.multiplier[chunk_counter] = multiplier; + } + // DEBUG + //if(sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Chunk %d : Count: %d Row: %u\n", chunk_counter, count, sharedRows[blockIdx.x]); + // DEBUG + outsum += count; + ++chunk_counter; + } + + smem.numChunks = min(chunk_counter, MERGE_MAX_CHUNKS); + smem.completed = (outsum < ELEMENTS_PER_THREAD*THREADS) ? 
1 : 0; + if (smem.restart == RESTART_OFF) + restart_num_chunks[(blockIdx.x)] = smem.numChunks; + } + else if (threadIdx.x == 0) + { + smem.numChunks = restart_num_chunks[(blockIdx.x)]; + smem.completed = 0; + } + __syncthreads(); + + if (smem.runflag != 0) + return; + + // Sorting only if >= RESTART_FIRST_ITERATION + { + uint32_t value[1]{threadIdx.x}; + if (smem.restart < RESTART_FIRST_ITERATION) + { + ChunkSortType key[1]; + + if (threadIdx.x < smem.numChunks) + key[0] = smem.sort_keys[threadIdx.x]; + else + key[0] = 0xFFFFFFFF; +#ifdef ENABLE_SORTING + IndexSorter(smem.indexptrtempmem).Sort(key, value); +#endif + } + + for (int i = threadIdx.x; i < MERGE_MAX_CHUNKS; i += THREADS) + { + smem.indexing[threadIdx.x] = value[0]; + } + } + __syncthreads(); + + // If elements can't be held in temp, load samples (MERGE_MAX_PATH_OPTIONS per chunk) + if (!smem.completed) + { + if (smem.restart >= RESTART_FIRST_ITERATION) + { + // Load values from last restart + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + if (lid == 0) + { + // Do not use indexing here as we write the chunks out in correct order + smem.chunkElementCount[wip] = restart_chunkElementCount[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + smem.chunkIndices[wip] = restart_chunkIndices[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + //fixme: RL bad practice....... + smem.chunkValues[wip] = *reinterpret_cast*> (&restart_chunkValues[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]); + smem.multiplier[wip] = restart_multiplier[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip]; + } + } + if (threadIdx.x == 0 && smem.restart == RESTART_ITERATION_FINISH) + { + // We want to finish in the next iteration + smem.completed = 1; + } + } + else + { + __syncthreads(); + // We start our first iteration soon + if (threadIdx.x == 0) + { + smem.restart = RESTART_FIRST_ITERATION; + } + } + __syncthreads(); + + //load samples from each list for column offset (warp based in parallel) + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + uint32_t count = smem.chunkElementCount[smem.indexing[wip]]; + uint32_t step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + uint32_t test = min(count - 1, step * lid); + INDEX_TYPE id = count > 0 ? smem.chunkIndices[smem.indexing[wip]][test] : 0xFFFFFFFF; + smem.id_samples[wip][lid] = id; + } + } + else if (threadIdx.x == 0) + { + // We are in the wrong case, remember that here + smem.restart = RESTART_WRONG_CASE; + } + __syncthreads(); + + // DEBUG + //printSampling(sharedRows, smem.numChunks, smem.id_samples, ROW_TO_INVESTIGATE); + // DEBUG + + while (true) + { + int chunkWorkElements[1]; + if (!smem.completed) + { + INDEX_TYPE mySampledIds[LengthSamplesPerThread]; + ushort2 mySamplePayload[LengthSamplesPerThread]; + +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + { + uint32_t lid = i*THREADS + threadIdx.x; + uint32_t chunk = lid / (MERGE_MAX_PATH_OPTIONS + 1); + uint32_t sample = lid - chunk * (MERGE_MAX_PATH_OPTIONS + 1); + if (chunk < smem.numChunks) + { + mySampledIds[i] = sample == 0 ? 
0 : smem.id_samples[chunk][sample - 1]; + mySamplePayload[i] = make_ushort2(chunk, sample); + } + else + { + mySampledIds[i] = 0xFFFFFFFF; + mySamplePayload[i] = make_ushort2(MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS + 1); + } + } + __syncthreads(); + + //sort according to index + SampleSorter(smem.sorterTempMem).Sort(mySampledIds, mySamplePayload); + + //construct bitmask + PathEncoding paths[LengthSamplesPerThread]; +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + paths[i] = static_cast(mySamplePayload[i].y) << static_cast(mySamplePayload[i].x * PathEncodingBits); + //merge up + PathMergeScan(smem.pathmergeTempMem).InclusiveScan(paths, paths, PathMergerOp()); + + // reset and then compute output count + uint32_t outputCount[LengthSamplesPerThread]; + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + outputCount[i] = 0; + + const PathEncoding Mask = (1 << PathEncodingBits) - 1; +#pragma unroll + for (uint32_t chunk = 0; chunk < MERGE_MAX_CHUNKS; ++chunk) + { + if (chunk < smem.numChunks) + { + uint32_t count = smem.chunkElementCount[smem.indexing[chunk]]; + uint32_t step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread; ++i) + { + uint32_t chunkPath = static_cast((paths[i] >> (PathEncodingBits * chunk)) & Mask); + outputCount[i] += min(count, step * chunkPath); + } + } + } + __syncthreads(); + + // ######## DEBUG + //printCountPerSampling(sharedRows, outputCount[0], mySampledIds[0], 2 * ELEMENTS_PER_THREAD*THREADS, ROW_TO_INVESTIGATE); + // ######## DEBUG + + //publish so next can check it + smem.downStreamCount[THREADS] = 0xFFFFFFFF; + smem.downStreamIndices[THREADS] = 0; + smem.downStreamIndices[threadIdx.x] = mySampledIds[0]; + + smem.usePath = 0; + smem.useMaxId = 0; + __syncthreads(); + + // Propagate outputcount locally first such that first element per array is correct +#pragma unroll + for (uint32_t i = LengthSamplesPerThread - 1; i > 0; --i) + if (mySampledIds[i - 1] == mySampledIds[i]) + outputCount[i - 1] = outputCount[i]; + + smem.downStreamCount[threadIdx.x] = outputCount[0]; + __syncthreads(); + + //propagate count over equal ids over arrays + bool prop = mySampledIds[0] == smem.downStreamIndices[threadIdx.x + 1] && + mySampledIds[0] != 0xFFFFFFFF; + bool changed; + do + { + changed = prop && smem.downStreamCount[threadIdx.x + 1] != outputCount[0]; + if (changed) + smem.downStreamCount[threadIdx.x] = outputCount[0] = smem.downStreamCount[threadIdx.x + 1]; + changed = __syncthreads_or(changed); + } while (changed); + + //propagate count locally again + if (mySampledIds[LengthSamplesPerThread - 1] == smem.downStreamIndices[threadIdx.x + 1]) + outputCount[LengthSamplesPerThread - 1] = smem.downStreamCount[threadIdx.x + 1]; +#pragma unroll + for (uint32_t i = LengthSamplesPerThread - 1; i > 0; --i) + if (mySampledIds[i - 1] == mySampledIds[i]) + outputCount[i - 1] = outputCount[i]; + + // ######## DEBUG + //printCountPerSampling(sharedRows, outputCount[0], mySampledIds[0], 2 * ELEMENTS_PER_THREAD*THREADS, ROW_TO_INVESTIGATE); + // ######## DEBUG + + //find the first that goes over the threshold + if (outputCount[LengthSamplesPerThread - 1] <= ELEMENTS_PER_THREAD*THREADS && smem.downStreamCount[threadIdx.x + 1] > ELEMENTS_PER_THREAD*THREADS) + { + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("THREAD: %u Outputcount: %u Next Count %u | path: %llu maxid: %u\n", threadIdx.x, outputCount[LengthSamplesPerThread - 1], 
smem.downStreamCount[threadIdx.x + 1], paths[LengthSamplesPerThread - 1], smem.downStreamIndices[threadIdx.x + 1]);*/ + // ######## DEBUG + smem.usePath = paths[LengthSamplesPerThread - 1]; + smem.useMaxId = smem.downStreamIndices[threadIdx.x + 1]; + } + +#pragma unroll + for (uint32_t i = 0; i < LengthSamplesPerThread - 1; ++i) + { + if (outputCount[i] <= ELEMENTS_PER_THREAD*THREADS && outputCount[i + 1] > ELEMENTS_PER_THREAD*THREADS) + { + smem.usePath = paths[i]; + smem.useMaxId = mySampledIds[i + 1]; + } + } + + smem.completed = 1; + __syncthreads(); + + if (smem.usePath == 0) + { + //if (sharedRows[blockIdx.x] != ROW_TO_INVESTIGATE) + // return; + + // ######## DEBUG + /*if(sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printInvalidPath(sharedRows);*/ + // ######## DEBUG + + if (threadIdx.x == 0) + { + smem.useMaxId = UINT32_MAX; + smem.halveStep = 1; + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("-----------------------------------------------------------------------------------\n");*/ + // ######## DEBUG + } + __syncthreads(); + + // Go one half step -> get smallest ID + // -> all chunks should reach this with now at most half the workload + if (threadIdx.x < smem.numChunks) + { + uint32_t count = smem.chunkElementCount[smem.indexing[threadIdx.x]]; + int step = ((count + (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR) - 1) / (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR)); + if (count > 1) + { + INDEX_TYPE id = smem.chunkIndices[smem.indexing[threadIdx.x]][step]; + // ######## DEBUG + //if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + // printf("Chunk: %d with Count: %d - step: %d| Check out ID for chunk: %u\n", threadIdx.x, count, step, id); + // ######## DEBUG + atomicMin(&(smem.useMaxId), id); + } + } + __syncthreads(); + + // Select all chunks that are below this ID + if (threadIdx.x == 0) + { + // ######## DEBUG + /*if (sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE) + printf("MaxID chosen: %u\n", smem.useMaxId);*/ + // ######## DEBUG + for (int i = 0; i < smem.numChunks; ++i) + { + if (smem.chunkElementCount[smem.indexing[i]] > 0 && smem.chunkIndices[smem.indexing[i]][0] < smem.useMaxId) + { + // Take these chunks -> for each chunk set the path to 1 + smem.usePath |= static_cast(1) << static_cast(i * PathEncodingBits); + } + } + } + __syncthreads(); + } + // ###################################################################################################################################################### + + + //determine actual chunk ends to use + for (int wip = threadIdx.x / WARP_SIZE; wip < smem.numChunks; wip += THREADS / WARP_SIZE) + { + const PathEncoding PathCodingMask = (1 << PathEncodingBits) - 1; + int lpos = static_cast((smem.usePath >> (wip*PathEncodingBits)) & PathCodingMask); + int count = smem.chunkElementCount[smem.indexing[wip]]; + int step; + if (smem.halveStep) + step = ((count + (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR) - 1) / (MERGE_MAX_PATH_OPTIONS * DIVISION_FACTOR)); + else + step = (count + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + int startpos = max(0, step * (lpos - 1)); + int endpos = min(count, step * lpos); + + smem.chunkTakeElements[wip] = endpos; + int current = endpos; + + for (int i = startpos + laneid(); i < endpos; i += WARP_SIZE) + { + INDEX_TYPE next = static_cast(-1); + if (i < count - 1) + next = smem.chunkIndices[smem.indexing[wip]][i + 1]; + if (smem.chunkIndices[smem.indexing[wip]][i] < smem.useMaxId && smem.useMaxId <= next) + current = i + 1; + } + + uint32_t found = 
__ballot_sync(0xFFFFFFFF, current != endpos); + if (found != 0) + { + current = __shfl_sync(0xFFFFFFF, current, __ffs(found) - 1); + smem.chunkTakeElements[wip] = current; + } + + //not reduced to 0 -> set completed false + if (current != count) + smem.completed = 0; + } + __syncthreads(); + + + chunkWorkElements[0] = 0; + if (threadIdx.x < smem.numChunks) + { + chunkWorkElements[0] = smem.chunkTakeElements[threadIdx.x]; + } + } + else + { + //we can combine all at once! + chunkWorkElements[0] = 0; + if (threadIdx.x < smem.numChunks) + chunkWorkElements[0] = smem.chunkElementCount[smem.indexing[threadIdx.x]]; + } + + //use workdistribution to assign for loading + SingleLoadWorkDistribution:: template initialize(smem.single_workdistributionMem, smem.single_workdistributionTempMem, chunkWorkElements); + + int chunk[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = SingleLoadWorkDistribution:: template assignWorkAllThreads( + smem.single_workdistributionMem, smem.single_workdistributionTempMem, smem.single_workdistributionTempMemOutFull, + chunk, element); + + // ######## DEBUG + if (threadIdx.x == 0 && elements == 0 /*&& sharedRows[blockIdx.x] == ROW_TO_INVESTIGATE*/) + { + //printf("Row: %u got 0 elements with maxID: %u\n", sharedRows[blockIdx.x], smem.useMaxId); + } + // ######## DEBUG + + int numOut; + // Combine entries + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + uint32_t combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const INDEX_TYPE* __restrict ip = smem.chunkIndices[smem.indexing[chunk[i]]]; + combIndex[i] = ip[element[i]]; + + const Either dp = smem.chunkValues[smem.indexing[chunk[i]]]; + + if (dp.isFirst()) { + auto idx_ = element[i]; + RIGHT_t right_ = dp.valFirst()[idx_]; + auto idx_r_ = chunk[i]; + auto idx_r_2_ = smem.indexing[idx_r_]; + auto left_ = smem.multiplier[idx_r_2_]; + data[i] = semiring.multiply(left_ , right_); + } else { + auto idx_ = element[i]; + data[i] = dp.valSecond()[idx_]; + } + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + __syncthreads(); + + + auto & j =smem.single_sAndCMem; + + auto fo = [](auto a, auto b) { + return a == b; + }; + + auto bq = [](auto a, auto b) { + return true; + }; + + numOut = 2; + numOut = SortAndCombiner::combine(j, + combIndex, + data, + combinedEntries, + fo,bq + , semiring); + + + __syncthreads(); + // ######## DEBUG + /*if (numOut == 0 && threadIdx.x == 0) + { + printf("%d %d oops in max chunks\n", blockIdx.x, threadIdx.x); + }*/ + //if (numOut == 0) + // return; + // ######## DEBUG + } + + // create new chunk (could also reuse old ones if completely used up...?) 
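+ // (Explanatory note, not part of the original kernel.) As in the generalized
+ // merge, thread 0 now allocates the output chunk and sets run_flag bit 0x1 on
+ // allocator exhaustion or 0x2 on chunk-pointer overflow. What differs in the
+ // max-chunks case is the restart payload: if longChunkOffset comes back as -1,
+ // the block dumps its per-chunk progress (remaining element counts, the advanced
+ // index/value pointers and the multipliers) into restart_chunkElementCount,
+ // restart_chunkIndices, restart_chunkValues and restart_multiplier, so a later
+ // launch can resume the merge without re-walking the chunk list.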
+ if (threadIdx.x == 0) + { + // Try to allocate chunk + uint32_t chunkoff; + int ignored; + // Update pre alloc before the actual allocation + if (!allocChunk(numOut, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + chunkoff = static_cast(-1); + atomicOr(run_flag, 0x1); + // Write restart state + restart_completion[blockIdx.x + restart_offset] = smem.restart; + } + else + { + //need to add flag and offset for copy later (offset = s) + uint32_t s = smem.sumOut; + //write chunk header + INDEX_TYPE actualrow = sharedRows[blockIdx.x]; + //write chunk pointer + uint32_t chunk_pointer_position = atomicAdd(chunk_pointer_alloc, 1); + if (chunk_pointer_position >= chunk_pointer_sizes) + { + chunkoff = static_cast(-1); + atomicOr(run_flag,0x2); + if(chunk_pointer_position == chunk_pointer_sizes) + *chunk_pointer_pos = chunk_pointer_sizes; + // Write restart state + restart_completion[blockIdx.x + restart_offset] = smem.restart; + } + else + { + chunks_pointers[chunk_pointer_position] = reinterpret_cast(Chunk::place(chunks, chunkoff, numOut, actualrow, Chunk::StartingOffsetFlag | s, 0)); + //write row count + s += numOut; + smem.sumOut = s; + output_row_count[actualrow] = s; + } + } + smem.longChunkOffset = chunkoff; + } + + smem.remCounter = 0; + __syncthreads(); + + if (smem.longChunkOffset == static_cast(-1)) + { + // Write out current state and return + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + if (lid == 0) + { + restart_chunkElementCount[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = smem.chunkElementCount[smem.indexing[wip]]; + restart_multiplier[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = smem.multiplier[smem.indexing[wip]]; + restart_chunkIndices[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = const_cast(smem.chunkIndices[smem.indexing[wip]]); + // FIXME: RL - casting like this is a sin + restart_chunkValues[((blockIdx.x) * MERGE_MAX_CHUNKS) + wip] = *reinterpret_cast*>(&smem.chunkValues[smem.indexing[wip]]); + } + } + return; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + smem.longOutDataBuffer[pwrite] = combinedEntries[i].value; + smem.longOutIndexBuffer[pwrite] = combinedEntries[i].index; + } + } + __syncthreads(); + + //write out + if (written + threadIdx.x < numOut) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.longChunkOffset)->values_direct(numOut); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.longChunkOffset)->indices_direct(numOut); + + valstart[written + threadIdx.x] = smem.longOutDataBuffer[threadIdx.x]; + indexstart[written + threadIdx.x] = smem.longOutIndexBuffer[threadIdx.x]; + } + __syncthreads(); + } + + // Work is done, we can stop now + if (smem.completed) + break; + + //reduce all counts and adjust pointers + for (int wip = threadIdx.x / MERGE_MAX_PATH_OPTIONS; wip < smem.numChunks; wip += THREADS / MERGE_MAX_PATH_OPTIONS) + { + uint32_t lid = threadIdx.x % MERGE_MAX_PATH_OPTIONS; + uint32_t count = smem.chunkElementCount[smem.indexing[wip]]; + uint32_t rem = smem.chunkTakeElements[wip]; + + uint32_t newcount = count - rem; + 
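+ // (Explanatory note.) The rem elements consumed in this iteration are dropped
+ // from the front of the chunk: the element count shrinks to newcount and the
+ // index/value pointers are advanced by rem in the statements below. Afterwards
+ // MERGE_MAX_PATH_OPTIONS fresh column-id samples are taken from the remainder,
+ // and the surviving counts are accumulated into remCounter to decide whether the
+ // next iteration can combine everything that is left in one pass (smem.completed).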
smem.chunkElementCount[smem.indexing[wip]] = newcount; + const INDEX_TYPE* __restrict newchunkIndices = smem.chunkIndices[smem.indexing[wip]] + rem; + smem.chunkIndices[smem.indexing[wip]] = newchunkIndices; + Either newchunkValues; //fixme RL : add restrict on interior types? + + if (smem.chunkValues[smem.indexing[wip]].isFirst()) { + newchunkValues = Either::First(smem.chunkValues[smem.indexing[wip]].valFirst() + rem); + } else { + newchunkValues = Either::Second(smem.chunkValues[smem.indexing[wip]].valSecond() + rem); + } + + smem.chunkValues[smem.indexing[wip]] = newchunkValues; + + uint32_t step = (newcount + MERGE_MAX_PATH_OPTIONS - 1) / MERGE_MAX_PATH_OPTIONS; + uint32_t test = min(newcount - 1, step * lid); + INDEX_TYPE id = newcount > 0 ? newchunkIndices[test] : 0xFFFFFFFF; + smem.id_samples[wip][lid] = id; + + if (lid == 0) + atomicAdd(&smem.remCounter, newcount); + } + __syncthreads(); + + // ######## DEBUG + //printSampling(sharedRows, smem.numChunks, smem.id_samples, ROW_TO_INVESTIGATE); + // ######## DEBUG + + smem.completed = smem.remCounter < ELEMENTS_PER_THREAD*THREADS ? 1 : 0; + if (threadIdx.x == 0) + { + smem.restart = smem.completed ? RESTART_ITERATION_FINISH : RESTART_ITERATION_UNKNOWN; + smem.halveStep = 0; + } + __syncthreads(); + } + + // This row is done + if (threadIdx.x == 0) + { + shared_rows_handled[blockIdx.x + restart_offset] = 1; + } + + return; +} + + +template + void AcSpGEMMKernels::h_mergeSharedRowsMaxChunks(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, + INDEX_TYPE** restart_chunkIndices, Either* restart_chunkValues, typename SEMIRING_t::leftInput_t* restart_multiplier, uint32_t* restart_chunkElementCount, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + + mergeSharedRowsMaxChunks<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, + restart_chunkIndices, restart_chunkValues, restart_multiplier, restart_chunkElementCount, restart_offset, restart_num_chunks, chunk_pointer_pos, semiring); +} + + +#define GPUCompressedMatrixMatrixMultiplyMergeMaxChunks(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsMaxChunks \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, \ + uint32_t** restart_chunkIndices, TYPE** restart_chunkValues, TYPE* restart_multiplier, uint32_t* restart_chunkElementCountDataOffset2, uint32_t restart_offset, uint32_t* restart_num_chunks, uint32_t* chunk_pointer_pos); diff --git a/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh 
b/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh new file mode 100644 index 00000000..09bb0d02 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_MergeSimple.cuh @@ -0,0 +1,393 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * MergeSimple.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include "MultiplyKernels.h" + +// ######################################################################################### +// +// Simple Case +// +// ######################################################################################### +template +__global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +mergeSharedRowsSimple(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + using Chunk = ::Chunk; + const uint32_t ELEMENTS_PER_THREAD = 2 * INPUT_ELEMENTS_PER_THREAD; + using SortType = ChooseBitDataType; + const uint32_t SharedRowsShift = LONG_SORT ? 
32 : count_clz::value; + const uint32_t SharedRowsBits = 32 - count_clz::value; + const SortType SharedRowsColMask = (SortType(1) << SharedRowsShift) - 1; + const SortType SharedRowsMaskShifted = ~SharedRowsColMask; + using LoadWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + + struct SMem + { + + uint32_t runflag, chunk_pointer_position; + uint32_t startSharedRow, numSharedRow; + INDEX_TYPE minColumnId[THREADS]; + + union + { + struct + { + const typename SEMIRING_t::output_t* dataPointer[2 * THREADS]; + union + { + ushort2 fromDataOffset[THREADS]; + uint16_t dataToIndexOffset[2 * THREADS]; + }; + struct { + typename LoadWorkDistribution::SharedMemT workdistributionMem; + typename LoadWorkDistribution::SharedTempMemT workdistributionTempMem; + typename LoadWorkDistribution:: template SharedTempMemOutT workdistributionTempMemOutFull; + }; + }; + + typename SortAndCombiner::SMem sAndCMem; + + struct + { + typename SEMIRING_t::output_t outDataBuffer[THREADS]; + INDEX_TYPE outIndexBuffer[THREADS]; + ushort2 outRowIdRowOffsetBuffer[THREADS]; + uint32_t outRowCounts[THREADS]; + uint32_t outChunkOffset[THREADS]; + }; + }; + }; + + __shared__ SMem smem; + + //get my block's offset + if (threadIdx.x == 0) + { + uint32_t bstart = blockOffsets[blockIdx.x]; + uint32_t shared_handled = shared_rows_handled[blockIdx.x + restart_offset]; + smem.startSharedRow = bstart + shared_handled; + smem.numSharedRow = blockOffsets[blockIdx.x + 1] - (bstart + shared_handled); + smem.runflag = *run_flag; + } + + __syncthreads(); + + if (smem.numSharedRow == 0) + return; + + int count[2] = { 0, 0 }; + + //load all chunk information + if (threadIdx.x < smem.numSharedRow) + { + uint32_t idoffset[2] = { 0, 0 }; + uint32_t access_index[2] = { 0, 1 }; + uint64_t chunk = reinterpret_cast(output_row_list_heads[sharedRows[smem.startSharedRow + threadIdx.x]]); + // if (sharedRows[smem.startSharedRow + threadIdx.x] == ROW_TO_INVESTIGATE) + // printf("Row %d in SIMPLE\n", sharedRows[smem.startSharedRow + threadIdx.x]); + bool first_row = (chunk & 2) != 0; + Chunk* __restrict pChunk = reinterpret_cast(chunk & 0xFFFFFFFFFFFFFFFCULL); + Chunk* __restrict second; + if (first_row) + { + second = pChunk->readNextFront(); + } + else + { + second = pChunk->readNextBack(); + } + bool first_row2 = (reinterpret_cast(second) & 2) != 0; + second = reinterpret_cast(reinterpret_cast(second) & 0xFFFFFFFFFFFFFFFCULL); + +#ifdef ENABLE_SORTING + if (second->sort_key < pChunk->sort_key) + { + // Reverse access order + access_index[0] = 1; + access_index[1] = 0; + } +#endif + + INDEX_TYPE minColumnId; + + const typename SEMIRING_t::output_t* pdata; + idoffset[0] = pChunk->num_entries; + if (first_row) + { + count[access_index[0]] = pChunk->firstCountCleared(); + pdata = pChunk->values_direct(idoffset[0]); + minColumnId = pChunk->indices_direct(idoffset[0])[0]; + idoffset[0] = idoffset[0] * sizeof(typename SEMIRING_t::output_t); + pChunk->setFirstConsumed(); + } + else + { + count[access_index[0]] = pChunk->lastCountCleared(); + uint32_t baseoffset = idoffset[0] - count[access_index[0]]; + pdata = pChunk->values_direct(idoffset[0]) + baseoffset; + minColumnId = pChunk->indices_direct(idoffset[0])[baseoffset]; + idoffset[0] = count[access_index[0]] * sizeof(typename SEMIRING_t::output_t) + baseoffset * sizeof(INDEX_TYPE); + pChunk->setLastConsumed(); + } + + smem.dataPointer[2 * threadIdx.x + access_index[0]] = pdata; + + idoffset[1] = 
second->num_entries; + //we dont need to figure out whether the second pointer is front or back, as front follows back and vice versa + if (first_row2) + { + count[access_index[1]] = second->firstCountCleared(); + minColumnId = min(minColumnId, second->indices_direct(idoffset[1])[0]); + pdata = second->values_direct(idoffset[1]); + idoffset[1] = idoffset[1] * sizeof(typename SEMIRING_t::output_t); + second->setFirstConsumed(); + } + else + { + count[access_index[1]] = second->lastCountCleared(); + uint32_t baseoffset = idoffset[1] - count[access_index[1]]; + minColumnId = min(minColumnId, second->indices_direct(idoffset[1])[baseoffset]); + pdata = second->values_direct(idoffset[1]) + baseoffset; + idoffset[1] = count[access_index[1]] * sizeof(typename SEMIRING_t::output_t) + baseoffset * sizeof(INDEX_TYPE); + second->setLastConsumed(); + } + + smem.dataPointer[2 * threadIdx.x + access_index[1]] = pdata; + smem.fromDataOffset[threadIdx.x] = make_ushort2(idoffset[access_index[0]], idoffset[access_index[1]]); + smem.minColumnId[threadIdx.x] = minColumnId; + } + + //use workdistribution to assign for loading + LoadWorkDistribution::template initialize(smem.workdistributionMem, smem.workdistributionTempMem, count); + + int rowPair[ELEMENTS_PER_THREAD]; + int element[ELEMENTS_PER_THREAD]; + + int elements = LoadWorkDistribution:: template assignWorkAllThreads( + smem.workdistributionMem, smem.workdistributionTempMem, smem.workdistributionTempMemOutFull, + rowPair, element); + + int numOut; + ScanCombinerEntry combinedEntries[ELEMENTS_PER_THREAD]; + { + SortType combIndex[ELEMENTS_PER_THREAD]; + typename SEMIRING_t::output_t data[ELEMENTS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (element[i] >= 0) + { + const typename SEMIRING_t::output_t* dp = smem.dataPointer[rowPair[i]]; + const INDEX_TYPE* colptr = reinterpret_cast(reinterpret_cast(dp) + smem.dataToIndexOffset[rowPair[i]]); + INDEX_TYPE colid = colptr[element[i]]; + data[i] = dp[element[i]]; + uint32_t rowId = rowPair[i] / 2; + SortType redcolid = colid - smem.minColumnId[rowId]; + /*if (redcolid >= (SortType(1) << SharedRowsShift)) + printf("data mix up happening: %d >= %d (shift %d, off %d)!\n", redcolid, 1 << SharedRowsShift, SharedRowsShift, smem.minColumnId[rowId]);*/ + combIndex[i] = (static_cast(rowId) << SharedRowsShift) | redcolid; + } + else + { + data[i] = SEMIRING_t::AdditiveIdentity(); + combIndex[i] = static_cast(-1); + } + } + + __syncthreads(); + + numOut = SortAndCombiner::combine(smem.sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [SharedRowsMaskShifted](auto a, auto b) { + return (a & SharedRowsMaskShifted) == (b & SharedRowsMaskShifted); + }, semiring, LONG_SORT ? 
(32 + SharedRowsBits + 1) : 32); + } + + __syncthreads(); + + //write count for rows + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + if (combinedEntries[i].isRowend()) + { + uint32_t row = combinedEntries[i].index >> SharedRowsShift; + uint32_t rcount = combinedEntries[i].rowcount(); + smem.outRowCounts[row] = rcount; + } + } + + __syncthreads(); + + // Let's see if we can go ahead + if (threadIdx.x < smem.numSharedRow) + { + uint32_t chunkoff = 0xFFFFFFFF; + int ignored; + uint32_t elcount = smem.outRowCounts[threadIdx.x]; + if (!allocChunk(elcount, chunk_alloc, chunk_size, chunkoff, ignored, false)) + { + // We have to restart for this block at this point, set run_flag and remember how many rows are left + atomicOr(run_flag, 0x1); + smem.runflag = 1; + } + else + { + smem.outChunkOffset[threadIdx.x] = chunkoff; + } + } + __syncthreads(); + if (smem.runflag != 0) + { + return; + } + + if (threadIdx.x == 0) + { + smem.chunk_pointer_position = atomicAdd(chunk_pointer_alloc, smem.numSharedRow); + if (smem.chunk_pointer_position + smem.numSharedRow > chunk_pointer_sizes) + { + atomicOr(run_flag, 0x2); + smem.runflag = 1; + if (smem.chunk_pointer_position <= chunk_pointer_sizes) + *chunk_pointer_pos = smem.chunk_pointer_position; + } + } + __syncthreads(); + if (smem.runflag != 0) + { + return; + } + + // Allocate chunk for each row and update count in global + if (threadIdx.x < smem.numSharedRow) + { + uint32_t elcount = smem.outRowCounts[threadIdx.x]; + INDEX_TYPE actualrow = sharedRows[smem.startSharedRow + threadIdx.x]; + //write chunk pointer + chunks_pointers[smem.chunk_pointer_position + threadIdx.x] = reinterpret_cast(Chunk::place(chunks, smem.outChunkOffset[threadIdx.x], elcount, actualrow, 0, 0)); + //write row count + output_row_count[actualrow] = elcount; + } + + //loop over data and write out + for (uint32_t written = 0; written < numOut; written += THREADS) + { + //store in shared for coalesced out +#pragma unroll + for (int i = 0; i < ELEMENTS_PER_THREAD; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && + poffset >= written && poffset < written + THREADS) + { + uint32_t pwrite = poffset - written; + uint32_t row = combinedEntries[i].index >> SharedRowsShift; + smem.outDataBuffer[pwrite] = combinedEntries[i].value; + smem.outIndexBuffer[pwrite] = static_cast(combinedEntries[i].index & SharedRowsColMask) + smem.minColumnId[row]; + smem.outRowIdRowOffsetBuffer[pwrite] = make_ushort2(row, combinedEntries[i].rowcount() - 1); + } + } + __syncthreads(); + + //write out + if (written + threadIdx.x < numOut) + { + ushort2 row_offset = smem.outRowIdRowOffsetBuffer[threadIdx.x]; + uint32_t chunkoffset = smem.outChunkOffset[row_offset.x]; + if (chunkoffset != 0xFFFFFFFF) + { + uint32_t count = smem.outRowCounts[row_offset.x]; + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, chunkoffset)->values_direct(count); + INDEX_TYPE* indexstart = Chunk::cast(chunks, chunkoffset)->indices_direct(count); + valstart[row_offset.y] = smem.outDataBuffer[threadIdx.x]; + indexstart[row_offset.y] = smem.outIndexBuffer[threadIdx.x]; + } + } + __syncthreads(); + } + + // Indicator for restart + if (threadIdx.x == 0) + shared_rows_handled[blockIdx.x + restart_offset] += smem.numSharedRow; + + return; +} + + +template + void AcSpGEMMKernels::h_mergeSharedRowsSimple(const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, + OFFSET_TYPE* output_row_count, + uint32_t* chunks, uint32_t* 
chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + mergeSharedRowsSimple<<>>( + blockOffsets, sharedRows, output_row_list_heads, output_row_count, chunks, chunk_alloc, chunk_pre_alloc, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, run_flag, restart_completion, shared_rows_handled, restart_offset, chunk_pointer_pos, semiring); +} + +#define GPUCompressedMatrixMatrixMultiplyMergeSimple(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void AcSpGEMMKernels::h_mergeSharedRowsSimple \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos); \ + template void AcSpGEMMKernels::h_mergeSharedRowsSimple \ + (const uint32_t* __restrict blockOffsets, const uint32_t* __restrict sharedRows, void** output_row_list_heads, \ + uint32_t* output_row_count, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_pre_alloc, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* run_flag, uint32_t* restart_completion, uint32_t* shared_rows_handled, uint32_t restart_offset, uint32_t* chunk_pointer_pos); + \ No newline at end of file diff --git a/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh b/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh new file mode 100644 index 00000000..f118e910 --- /dev/null +++ b/include/GALATIC/include/device/acSpGEMM_SpGEMM.cuh @@ -0,0 +1,1132 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ + * SpGEMM.cuh + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +#pragma once + +#include +#include +#include +#include "MultiplyKernels.h" +#include "Chunk.cuh" +#include "HelperFunctions.cuh" +#include "WorkDistribution.cuh" +#include "ARowStorage.cuh" +#include "SortAndCombine.cuh" + + +//SORT_TYPE_MODE 0 .. 32bit direct, 1 32bit row remap, 2 64bit full +template + __global__ void __launch_bounds__(THREADS, BLOCKS_PER_MP) +computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + const typename SEMIRING_t::rightInput_t *__restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + static_assert(RETAIN_ELEMENTS_PER_THREAD >= 1, "need at least one temporary element per thread to assure coalesced write out"); + // fetch A data + // tag with row and col ids + // + // fill work distribution + // + // fetch rows from B (each thread fetches one element) + // multiply and sort in (multiply, sort, prefix sum) + // run a scan to combine, compute row offset, and memory offset + // + // either write out chunk or keep all data/last row in shared memory to continue the combination + + const int NNZ_PER_BLOCK = NNZ_PER_THREAD*THREADS; + const int TEMP_ITEMS_PER_BLOCK = (RETAIN_ELEMENTS_PER_THREAD*THREADS); + + using LEFT_T = typename SEMIRING_t::leftInput_t; + using RIGHT_t = typename SEMIRING_t::rightInput_t; + + //SORT_TYPE_MODE 0 .. 32bit direct, 1 32bit row remap, 2 64bit full + using SortType = ChooseBitDataType<(SORT_TYPE_MODE > 1 ) ? 
64 : 32>; + + const uint32_t ChunkSortingBits = (sizeof(ChunkSortType) * 8) - count_clz::value; + + // the number of elements each threads handles in registers + const int CombineElements = INPUT_ELEMENTS_PER_THREAD + RETAIN_ELEMENTS_PER_THREAD; + + // cutoff for rows in B which will directly be forwarded to the merge stage + const uint32_t LongRowCutOff = CombineElements * THREADS / 2; + + // used data types specialized for the setup + using RowelementWorkDistribution = WorkDistribution; + using SortAndCombiner = SortAndCombine; + using ScanCombinerEntry = typename SortAndCombiner::ScanCombinerEntry; + using SimpleScan = cub::BlockScan; + using SimpleIntScan = cub::BlockScan; + using Chunk = Chunk; + using DirectChunk = DirectChunk; + + using ARowStorage = ARowStorage; + struct SMem + { + + // flattened out A data + //INDEX_TYPE A_row_ids[NNZ_PER_BLOCK]; + uint32_t chunk_pointer_position, chunk_counter; + ARowStorage A_row_ids; + INDEX_TYPE A_col_ids[NNZ_PER_BLOCK]; + typename SEMIRING_t::leftInput_t A_indata[NNZ_PER_BLOCK]; + + + + + // comb data + union { + struct { + INDEX_TYPE current_col_ids[TEMP_ITEMS_PER_BLOCK]; + typename ARowStorage::EncodedRowType current_row_ids[TEMP_ITEMS_PER_BLOCK < THREADS ? THREADS + 1 : TEMP_ITEMS_PER_BLOCK]; + typename SEMIRING_t::output_t current_output[TEMP_ITEMS_PER_BLOCK]; + }; + struct { + uint32_t temp_work_storage_single[NNZ_PER_BLOCK]; + }; + }; + + //TODO: temp mem and comb data could be overlapped!? + + // temp mem + union { + struct { + typename RowelementWorkDistribution::SharedTempMemT workdistributionTempMem; + typename RowelementWorkDistribution:: template SharedTempMemOutT workdistributionTempMemOutFull; + }; + struct { + typename SimpleScan::TempStorage directChunkScanTempMem; + typename SimpleScan::TempStorage nonDirectChunkScanTempMem; + }; + typename SimpleIntScan::TempStorage intScanTempMem; + typename SortAndCombiner::SMem sAndCMem; + INDEX_TYPE rowCounts[TEMP_ITEMS_PER_BLOCK]; + }; + + + //work distribution + typename RowelementWorkDistribution::SharedMemT workdistributionMem; + + INDEX_TYPE minCol, maxCol; + typename ARowStorage::EncodedRowType minRow, maxRow; + + uint32_t chunkStartOffset; + uint32_t firstRowCount; + uint32_t lastRowCount; + uint32_t runflag; + uint32_t directChunkRows; + uint32_t brokenChunkOffsetStart, brokenChunkOffsetEnd; + + typename ARowStorage::EncodedRowType minBrokenChunkRow, maxBrokenChunkRow; + }; + + __shared__ SMem smem; + + __shared__ uint32_t block_start_end[2]; + //__shared__ int currentStartElementIndex, currentEndElementIndex; + //__shared__ uint32_t elem_handled_A, elem_handled_B, max_A, max_B, restart; + //__shared__ float lastExpected; + __shared__ int tempOffset, tempData, workavailable, consumedwork; + + // get block data + if (threadIdx.x < 2) + { + block_start_end[threadIdx.x] = startingIdsA[blockIdx.x + threadIdx.x]; + //smem.A_row_ids[0] = static_cast(-1); + //currentEndElementIndex = completion_status[blockIdx.x]; + //lastExpected = 0.0f; + + // if we stopped globally, dont even start, otherwise consider restart + //if (threadIdx.x == 0 && completion_status[blockIdx.x] != 0 && completion_status[blockIdx.x] != 0xFFFFFFFF) + // printf("%d restarting with %x %d\n", blockIdx.x, completion_status[blockIdx.x], completion_status[blockIdx.x] & (~0x80000000)); + smem.chunk_pointer_position = 0; + smem.directChunkRows = 0; + smem.runflag = *run_flag != 0 ? 
0xFFFFFFFF : completion_status[blockIdx.x]; + smem.chunk_counter = chunk_counter[blockIdx.x]; + + // for consume based restart, set consumedwork too + consumedwork = (smem.runflag & 0x80000000) == 0 ? smem.runflag : 0; + } + + smem.A_row_ids.clear(); + + __syncthreads(); + if (smem.runflag == std::numeric_limits::max()) + return; + + int worknnz = min(NNZ_PER_BLOCK, nnz - blockIdx.x * NNZ_PER_BLOCK); + + // Assign column ids of a + //TODO: adjust num threads per row either dynamic (could be always pow 2) or a few preset static ones + for (uint32_t r = block_start_end[0] + threadIdx.x; r <= block_start_end[1]; r += THREADS) + { + int ain = static_cast(offsetsA[r] - blockIdx.x * NNZ_PER_BLOCK); + int bin = offsetsA[min(rows, r + 1)] - blockIdx.x * NNZ_PER_BLOCK; + + int a = max(0, ain); + int b = min(static_cast(worknnz), bin); + + //iterate over all threads that start with that row + if (a < b) + { + smem.A_row_ids.storeReference(a, r); + int ra = a; + smem.A_row_ids.storeRow(a, ra, r); + for (++a; a < b; ++a) + smem.A_row_ids.storeRow(a, ra, r); + } + } + + __syncthreads(); + + bool directChunkRows = false; + int workToDistribute[NNZ_PER_THREAD]; + + // Read out lengths of rows from B for each element from A + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + uint32_t w = threadIdx.x + i * THREADS; + INDEX_TYPE a_col = 0; + uint32_t b_num = 0; + + + if (w < worknnz) + { + // normal case or work element based restart + bool load = true; + + if(load) + { + uint32_t l = w + blockIdx.x * NNZ_PER_BLOCK; + a_col = indicesA[l]; + b_num = offsetsB[a_col + 1] - offsetsB[a_col]; + + smem.A_col_ids[w] = indicesA[l]; + smem.A_indata[w] = valA[l]; + + // Long rows are directly referred to the merge stage by only writing an identifier chunck info + if (b_num >= LongRowCutOff) + { + // remember that we are now deadling with a dirct chunk row, which needs sorting + b_num = b_num | 0x80000000; + directChunkRows = true; + } + else if ((smem.runflag & 0x80000000) != 0) + { + // row based restart needs to set the consumed work too + uint32_t to_start_row = smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0]); + if (smem.A_row_ids.getEncodedRow(w) < to_start_row) + { + //printf("%d %d load %x\n", blockIdx.x, threadIdx.x, completion_status[blockIdx.x]); + atomicAdd(&consumedwork, b_num); + b_num = 0; + } + } + } + } + workToDistribute[i] = b_num; + } + + // move all direct chunk rows to the front so we can quickly identify them later + if (__syncthreads_or(directChunkRows)) + { + // only write out during first run + if (smem.runflag == 0) + { + uint32_t chunkoff[NNZ_PER_THREAD]; + bool success = true; + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // alloc special chunk and write out + if ((workToDistribute[i] & 0x80000000) != 0) + { + //FIXME: This is the wrong typez + // printf("%d %d allocating direct chunk for size %d\n", blockIdx.x, threadIdx.x, (workToDistribute[i] & (~0x80000000))); + if (!allocDirectChunk(chunk_alloc, chunk_size, chunkoff[i])) + { + success = false; + atomicOr(run_flag, 0x1); + } + atomicAdd(&(smem.chunk_pointer_position), 1); + } + } + if (__syncthreads_or(!success)) + { + //re start with old state and alloc all chunks in next run + return; + } + + if (threadIdx.x == 0) + { + uint32_t num_chunks = smem.chunk_pointer_position; + smem.chunk_pointer_position = atomicAdd(chunk_pointer_alloc, num_chunks); + if (smem.chunk_pointer_position + num_chunks >= chunk_pointer_sizes) + { + success = false; + 
atomicOr(run_flag, 0x2); + if(smem.chunk_pointer_position < chunk_pointer_sizes) + *chunk_pointer_pos = smem.chunk_pointer_position; + } + } + if (__syncthreads_or(!success)) + { + //re start with old state and alloc all chunks in next run + return; + } + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + if ((workToDistribute[i] & 0x80000000) != 0) + { + // printf("%d %d added DirectChunk for row \n", blockIdx.x, threadIdx.x); + + // write chunk data + DirectChunk * p_chunk = DirectChunk::cast(chunks, chunkoff[i]); + chunks_pointers[atomicAdd(&(smem.chunk_pointer_position), 1)] = reinterpret_cast(p_chunk); + + uint32_t w = threadIdx.x + i * THREADS; + auto encodedRow = smem.A_row_ids.getEncodedRow(w); + INDEX_TYPE r = smem.A_row_ids.decodeRow(encodedRow); + INDEX_TYPE a_col = smem.A_col_ids[w]; + uint32_t b_num = workToDistribute[i] & (~0x80000000); + DirectChunk::place(chunks, chunkoff[i], b_num, r, indicesB + offsetsB[a_col], valB + offsetsB[a_col], smem.A_indata[w], (static_cast(blockIdx.x) << ChunkSortingBits) | (threadIdx.x + i*THREADS + NNZ_PER_BLOCK)); + addPotentiallySharedRow(r, p_chunk, true, output_row_list_heads, shared_rows_tracker, shared_rows_alloc, true); + + // if ((r == 0)) + // printf("We have a direct chunk in row: %u with %u elements with col: %u\n", r, b_num, a_col); + + atomicAdd(output_row_chunk_count + r, 1); + // mark so we do not go through simple merge + if (INPUT_ELEMENTS_PER_THREAD * THREADS * MERGE_MAX_PATH_OPTIONS >= b_num) + { + // Set both top most bits if this can go to max chunks case + atomicOr(output_row_chunk_count + r, MAX_CHUNKS_CASE); + } + else + { + // Only set the topmost bit if this should go to the generalized case + atomicOr(output_row_chunk_count + r, GENERALIZED_CASE); + } + + + //no need to set count, as we will go through max or general merge anyway + // atomicAdd(output_row_count + r, CombineElements * THREADS); + + } + } + } + + + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + smem.temp_work_storage_single[threadIdx.x + i * THREADS] = ((workToDistribute[i] & 0x80000000) != 0) ? 
0xFFFFFFFF : workToDistribute[i]; + __syncthreads(); + + // run a prefix sum to figure out where to place the direct chunk row ids and others + uint32_t direct[NNZ_PER_THREAD], nonDirect[NNZ_PER_THREAD]; + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // note stripped layout + if (smem.temp_work_storage_single[threadIdx.x * NNZ_PER_THREAD + i] == 0xFFFFFFFF) + { + direct[i] = 1; + nonDirect[i] = 0; + } + else + { + direct[i] = 0; + nonDirect[i] = 1; + } + } + uint32_t sum_direct; + SimpleScan(smem.directChunkScanTempMem).ExclusiveSum(direct, direct, sum_direct); + SimpleScan(smem.nonDirectChunkScanTempMem).ExclusiveSum(nonDirect, nonDirect); + + INDEX_TYPE a_col[NNZ_PER_THREAD]; + VALUE_TYPE1 a_vals[NNZ_PER_THREAD]; + typename ARowStorage::EncodedRowType a_rowIds[NNZ_PER_THREAD]; + + //fetch the data + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + // note stripped layout + int r = threadIdx.x * NNZ_PER_THREAD + i; + a_col[i] = smem.A_col_ids[r]; + a_vals[i] = smem.A_indata[r]; + a_rowIds[i] = smem.A_row_ids.getEncodedRow(r); + workToDistribute[i] = smem.temp_work_storage_single[r]; + } + __syncthreads(); + + //store shuffled and cleared workload + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + { + + // note stripped layout + uint32_t p = nonDirect[i] + sum_direct; + if (workToDistribute[i] == 0xFFFFFFFF) + { + workToDistribute[i] = 0; + p = direct[i]; + } + + smem.A_col_ids[p] = a_col[i]; + smem.A_indata[p] = a_vals[i]; + smem.A_row_ids.storeEncodedRow(p, a_rowIds[i]); + smem.temp_work_storage_single[p] = workToDistribute[i]; + } + + smem.directChunkRows = sum_direct; + __syncthreads(); + + // load new work + #pragma unroll + for (uint32_t i = 0; i < NNZ_PER_THREAD; ++i) + workToDistribute[i] = smem.temp_work_storage_single[threadIdx.x * NNZ_PER_THREAD + i]; + + // Initialize the work distribution from stripped layout + RowelementWorkDistribution:: template initialize(smem.workdistributionMem, smem.workdistributionTempMem, workToDistribute); + } + else + { + // Initialize the work distribution from blocked layoyt and run while work is available + RowelementWorkDistribution:: template initialize(smem.workdistributionMem, smem.workdistributionTempMem, workToDistribute); + } + + // now kept in shared + tempData = 0; + tempOffset = 0; + + + // comsume based restart + if (smem.runflag != 0 && (smem.runflag & 0x80000000) == 0) + { + RowelementWorkDistribution::removework(smem.workdistributionMem, smem.runflag); + } + + + // note: potential race condition with removework, however the entire work will never be removed and we only compare with > 0 -> fine? + // TODO: -> we can remove syncthreads!? 
+ __syncthreads(); + + workavailable = RowelementWorkDistribution::workAvailable(smem.workdistributionMem); + while (workavailable > 0) + { + + int localAEntry[CombineElements]; + int elementB[CombineElements]; + + int elements = RowelementWorkDistribution:: template assignWorkAllThreads( + smem.workdistributionMem, smem.workdistributionTempMem, smem.workdistributionTempMemOutFull, + localAEntry, elementB, CombineElements*THREADS - tempData); + + if(threadIdx.x == 0) + consumedwork += CombineElements*THREADS - tempData; + + + typename ARowStorage::EncodedRowType temp_row[CombineElements]; + INDEX_TYPE temp_col_id[CombineElements]; + typename SEMIRING_t::output_t temp_val[CombineElements]; + + smem.minCol = smem.minRow = std::numeric_limits::max(); + smem.maxCol = smem.maxRow = 0; + + // locel min/max row and col + INDEX_TYPE minRow = std::numeric_limits::max(), maxRow = 0; + INDEX_TYPE minCol = std::numeric_limits::max(), maxCol = 0; + + + + //fetch B data and set MIN/MAX values for how many rows in A and how many cols and B are touched + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (i < elements) + { + uint32_t aentry = localAEntry[i]; + uint32_t fetch_row = smem.A_col_ids[aentry]; + temp_row[i] = smem.A_row_ids.getEncodedRow(aentry); + minRow = min(minRow, temp_row[i]); + maxRow = max(maxRow, temp_row[i]); + + + //if (elementB[i] < 0 || aentry >= worknnz) + // printf("%d %d [%d]: max %d - nnz: %d - req: %d/%d - %d %d\n", blockIdx.x, threadIdx.x, i, elements, worknnz, CombineElements*THREADS - tempData, workavailable, localAEntry[i], elementB[i]); + + INDEX_TYPE elb = offsetsB[fetch_row] + elementB[i]; + temp_col_id[i] = indicesB[elb]; + temp_val[i] = semiring.multiply(smem.A_indata[aentry], valB[elb]); + + minCol = min(minCol, temp_col_id[i]); + maxCol = max(maxCol, temp_col_id[i]); + } + else + { + // get from last iteration + int t = i * THREADS + threadIdx.x - (CombineElements*THREADS - tempData); + if (t >= 0) + { + int access = (tempOffset + t) % TEMP_ITEMS_PER_BLOCK; + // offset tells us where the last row data is currently placed + temp_row[i] = smem.current_row_ids[access]; + temp_col_id[i] = smem.current_col_ids[access]; + temp_val[i] = smem.current_output[access]; + + // printf("%d %d (%d %d %d): %d %d %f\n", blockIdx.x, threadIdx.x, access, t, tempData, temp_row[i], temp_col_id[i], temp_val[i]); + + minRow = min(minRow, temp_row[i]); + maxRow = max(maxRow, temp_row[i]); + + minCol = min(minCol, temp_col_id[i]); + maxCol = max(maxCol, temp_col_id[i]); + + //dummy value to indicate that we have something + elementB[i] = 1; + } + else + //indicate that we are empty + elementB[i] = -1; + } + } + + // + updateMinValue(smem.minCol, minCol); + updateMinValue(smem.minRow, minRow); + updateMaxValue(smem.maxCol, maxCol); + updateMaxValue(smem.maxRow, maxRow); + + __syncthreads(); + + + INDEX_TYPE colRange = smem.maxCol - smem.minCol; + INDEX_TYPE rowRange = smem.maxRow - smem.minRow + 1; + INDEX_TYPE colBits = 32 - __clz(colRange); + INDEX_TYPE rowBits = 32 - __clz(rowRange); + + + if (colBits + rowBits > 32 && threadIdx.x == 0) + { + printf("colRange: %u rowRange: %u colBits: %u rowBits: %u | minCol: %u maxCol: %u | minRow: %u maxRow: %u\n", colRange, rowRange, colBits, rowBits, smem.minCol, smem.maxCol, smem.minRow, smem.maxRow); + //return; + } + + ScanCombinerEntry combinedEntries[CombineElements]; + { + //TODO: if there are fewer items only, we want to only sort those... + //TODO: if we can use uint32_t instead of uint64_t we want to use that... 
+ SortType combIndex[CombineElements]; + typename SEMIRING_t::output_t data[CombineElements]; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (elementB[i] >= 0) + { + combIndex[i] = (static_cast(temp_row[i] - smem.minRow) << colBits) | (temp_col_id[i] - smem.minCol); + data[i] = temp_val[i]; + } + else + { + combIndex[i] = ~SortType(0); + data[i] = SEMIRING_t::AdditiveIdentity(); + } + } + + tempData = SortAndCombiner::combine(smem.sAndCMem, combIndex, data, combinedEntries, + [](auto a, auto b) { + return a == b; + }, + [colBits](auto a, auto b) { + return (a >> colBits) == (b >> colBits); + }, semiring, + colBits + rowBits); + + } + + + workavailable = RowelementWorkDistribution::workAvailable(smem.workdistributionMem); + + //we would like to know how many elements we have from the last row + // TODO: check if that is right + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + if (combinedEntries[i].isRowend() && combinedEntries[i].memoffset() == tempData - 1) + smem.lastRowCount = combinedEntries[i].rowcount(); + + __syncthreads(); + + // if (threadIdx.x == 0) + // printf("%d decision to make: %d >= %d || !%d || 8 * %d < %d\n", blockIdx.x, tempData, TEMP_ITEMS_PER_BLOCK, workavailable, smem.lastRowCount, tempData); + + // TODO: check heuristic + // if we must go out or if the last row is very small in comparison to the other data + if (tempData >= TEMP_ITEMS_PER_BLOCK || workavailable <= 0 || 1* smem.lastRowCount < tempData) + { + // keep the last row around if we can so we reduce the amount of merging we have to perform + int allocData = workavailable > 0 && smem.lastRowCount < TEMP_ITEMS_PER_BLOCK ? tempData - smem.lastRowCount : tempData; + + // determine how many chunks we need to generate (additional ones for single row chunks in between) + bool multiChunk = false; + + if (smem.directChunkRows != 0) + { + for (uint32_t i = threadIdx.x; i < smem.directChunkRows; i += THREADS) + multiChunk = multiChunk || (smem.minRow < smem.A_row_ids.getEncodedRow(i) && smem.A_row_ids.getEncodedRow(i) < smem.maxRow); + + multiChunk = __syncthreads_or(multiChunk); + } + + + if (multiChunk) + { + // we need to separate the output into multiple chunks + //if (threadIdx.x == 0 && (smem.maxRow == 7094 || smem.maxRow == 6025 || smem.maxRow == 5086 || smem.maxRow == 5273 || smem.maxRow == 7350)) + // printf("%d %d split chunk for %d-%d .. 
%d %d\n", blockIdx.x, threadIdx.x, smem.minRow, smem.maxRow, allocData, tempData); + + // init smem + smem.brokenChunkOffsetStart = 0; + smem.minBrokenChunkRow = smem.minRow; + smem.maxBrokenChunkRow = smem.maxRow; + + + // determine individual chunk ends + // iterate over shared rows list and my data to see how many chunk boundaries i need to add + // need access to the next element -> store in shared + smem.current_row_ids[threadIdx.x+1] = (combinedEntries[CombineElements-1].index >> colBits) + smem.minRow; + smem.current_row_ids[0] = smem.minRow; + + __syncthreads(); + uint32_t chunk_splitting_row_id = 0; + uint32_t chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + typename ARowStorage::EncodedRowType r = smem.current_row_ids[threadIdx.x]; + // search for the first chunk breaking row that is larger than the row handled by the previous thread + // ie find the first chunk breaking row that can be relevant for my entries + while (chunk_splitting_row <= r) + { + if (++chunk_splitting_row_id < smem.directChunkRows) + { + chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + } + else + { + // this threads entries are above all chunk breaking rows, so set it to max + chunk_splitting_row = smem.maxRow + 1; + break; + } + } + + //if (threadIdx.x == 0 && smem.maxRow == 7094) + //{ + // printf("Min: %u and Max: %u\n", smem.minRow, smem.maxRow); + //} + + //if (/*threadIdx.x == 0 &&*/ smem.maxRow == 7094) + //{ + // printf("Chunk splitting row: %u with r: %u\n", chunk_splitting_row, r); + //} + + // determine where to break + static_assert(CombineElements <= 32, "can handle a maximum of 32 CombinedElements when performing multi chunk out"); + uint32_t chunk_breaks = 0; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + typename ARowStorage::EncodedRowType next_r = min(static_cast((combinedEntries[i].index >> colBits)) + smem.minRow, smem.maxRow); + /*if (smem.maxRow == 7094 && r < 7100 && next_r < 7100 && r != next_r) + { + printf("Row given: %u | %u nextrow\n", r, next_r); + }*/ + /*if (r != next_r && chunk_splitting_row <= next_r && (next_r == smem.maxRow) && chunk_splitting_row != smem.maxRow) + { + printf("R: %u | next_R: %u | chunk_splitting: %u | max: %u ----- directid: %u maxid: %u\n", r, next_r, chunk_splitting_row, smem.maxRow, chunk_splitting_row_id, smem.directChunkRows); + }*/ + /*if (r != next_r && chunk_splitting_row <= next_r && next_r != smem.maxRow)*/ + /*if (r != next_r && chunk_splitting_row <= next_r && chunk_splitting_row != smem.maxRow && tempData == allocData)*/ + /*if (r != next_r && chunk_splitting_row <= next_r && (next_r != smem.maxRow || next_r == 7094 || next_r == 6025))*/ + if (r != next_r && chunk_splitting_row <= next_r && (next_r != smem.maxRow || (chunk_splitting_row != smem.maxRow && tempData == allocData))) + { + // we are at a chunk boundary + chunk_breaks |= (1 << i); + //if(smem.maxRow == 7094) + // printf("%d %d breaks chunk between %d %d\n", blockIdx.x, threadIdx.x, r, next_r); + // find next + do + { + if (++chunk_splitting_row_id < smem.directChunkRows) + chunk_splitting_row = smem.A_row_ids.getEncodedRow(chunk_splitting_row_id); + else + { + chunk_splitting_row = smem.maxRow + 1; + break; + } + } while (chunk_splitting_row <= next_r); + } + r = next_r; + } + + // run prefix sum to figure out how many chunk breaks to insert + int num_broken_chunks[1] = { __popc(chunk_breaks) }; + int overall_broken_chunk, my_starting_offset[1]; + 
SimpleIntScan(smem.intScanTempMem).ExclusiveSum(num_broken_chunks, my_starting_offset, overall_broken_chunk); + + + //if (threadIdx.x == 0 && smem.maxRow == 7094) + // printf("%d %d overall broken chunks: %d\n", blockIdx.x, threadIdx.x, overall_broken_chunk); + + // iterate over broken up chunks and write out in the typical manner + for (int c = 0; c <= overall_broken_chunk; ++c) + { + __syncthreads(); + int local_chunk = c - my_starting_offset[0]; + if (local_chunk >= 0 && local_chunk < num_broken_chunks[0]) + { + // it is our chunk - extract + int handled_bits = 0; + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if ((chunk_breaks & (1 << i)) != 0) + { + if (handled_bits == local_chunk) + { + if (combinedEntries[i].isResult()) + smem.brokenChunkOffsetEnd = combinedEntries[i].memoffset(); + else + smem.brokenChunkOffsetEnd = combinedEntries[i].memoffset() + 1; + //printf("%d %d its my chunk time %d: %d\n", blockIdx.x, threadIdx.x, i, combinedEntries[i].memoffset()); + } + ++handled_bits; + } + } + } + __syncthreads(); + + if(threadIdx.x == 0) + { + if (c == overall_broken_chunk) + { + // need to setup last chunk + smem.brokenChunkOffsetEnd = smem.brokenChunkOffsetStart + tempData; + //printf("%d %d its last chunk time: %d\n", blockIdx.x, threadIdx.x, smem.brokenChunkOffsetStart + tempData); + } + /*if (threadIdx.x == 0 && smem.maxRow == 1878) + printf("We have allocData %u and other %u\n", allocData, smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart);*/ + uint32_t chunkoff = completeChunkAlloc(min(smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart, allocData), chunks, chunk_alloc, chunk_size, chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, chunk_pointer_pos, + [&]() + { + atomicOr(run_flag, 0x1); + //if(threadIdx.x == 0) + + // Write out descriptor for restart into global + //printf("%d going for restart: %x: %d -> %d -- block row range: %d<->%d\n", blockIdx.x, smem.runflag, (smem.runflag&(~0x80000000)), smem.A_row_ids.decodeRow(smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0])), block_start_end[0], block_start_end[1]); + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }, [&]() + { + atomicOr(run_flag, 0x2); + // Write out descriptor for restart into global + //if(threadIdx.x == 0) + //printf("%d going for restart: %x: %d -> %d -- block row range: %d<->%d\n", blockIdx.x, smem.runflag, (smem.runflag&(~0x80000000)), smem.A_row_ids.decodeRow(smem.A_row_ids.restartRowDecode((smem.runflag & (~0x80000000)), block_start_end[0])), block_start_end[0], block_start_end[1]); + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }); + smem.chunkStartOffset = chunkoff; + } + + __syncthreads(); + if (smem.chunkStartOffset == 0xFFFFFFFF) + return; + + smem.firstRowCount = 0; + + int num = min(smem.brokenChunkOffsetEnd - smem.brokenChunkOffsetStart, allocData); + + allocData -= num; + + // write data for this chunk to smem and write out + for (uint32_t written = smem.brokenChunkOffsetStart; written < smem.brokenChunkOffsetEnd; written += TEMP_ITEMS_PER_BLOCK) + { + //store in shared for coalesced out + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && poffset >= written && poffset < written + TEMP_ITEMS_PER_BLOCK) + { + uint32_t pwrite = poffset - written; + INDEX_TYPE col = (combinedEntries[i].index & ((1u << colBits) - 1)) + 
smem.minCol; + typename ARowStorage::EncodedRowType row = (combinedEntries[i].index >> colBits) + smem.minRow; + smem.current_col_ids[pwrite] = col; + smem.current_row_ids[pwrite] = row; + smem.current_output[pwrite] = combinedEntries[i].value; + smem.rowCounts[pwrite] = combinedEntries[i].isRowend() ? combinedEntries[i].rowcount() : 0; + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < RETAIN_ELEMENTS_PER_THREAD; ++i) + { + //write out + INDEX_TYPE rid; + int writeout = written + i * THREADS + threadIdx.x - smem.brokenChunkOffsetStart; + if (writeout < num) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.chunkStartOffset)->values_direct(num); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.chunkStartOffset)->indices_direct(num); + valstart[writeout] = smem.current_output[i * THREADS + threadIdx.x]; + indexstart[writeout] = smem.current_col_ids[i * THREADS + threadIdx.x]; + rid = smem.current_row_ids[i * THREADS + threadIdx.x]; + // if (rid >= rows) { + // printf("%d %d rid bad row read %d \n",blockIdx.x, threadIdx.x , rid); + // } + if (smem.A_row_ids.decodeRow(rid) == 1878) + { + /*if(smem.current_col_ids[i * THREADS + threadIdx.x] == 0) + printf("ChunkStartOffset: %u with num: %u\n", smem.chunkStartOffset, num); + printf("Row %u: %u\n", smem.A_row_ids.decodeRow(rid), smem.current_col_ids[i * THREADS + threadIdx.x]);*/ + } + if (writeout == num - 1) + { + smem.maxBrokenChunkRow = rid; + smem.lastRowCount = smem.rowCounts[i * THREADS + threadIdx.x]; + } + } + else + rid = std::numeric_limits::max(); + + uint32_t rcount = smem.rowCounts[i * THREADS + threadIdx.x]; + if (rcount != 0 && rid != std::numeric_limits::max()) + { + //write row count + if (smem.firstRowCount == 0 && rid == smem.current_row_ids[0]) + { + smem.minBrokenChunkRow = rid; + smem.firstRowCount = rcount; + } + if ((smem.A_row_ids.decodeRow(rid) == 1878) /*|| (smem.A_row_ids.decodeRow(rid) == 11614) || (smem.A_row_ids.decodeRow(rid) == 14759) || (smem.A_row_ids.decodeRow(rid) == 14767) || (smem.A_row_ids.decodeRow(rid) == 11125)*/) + printf("Adding count: %u to row %u\n", rcount, (smem.A_row_ids.decodeRow(rid))); + atomicAdd(output_row_count + smem.A_row_ids.decodeRow(rid), rcount); + } + } + __syncthreads(); + } + + // last is shared if we are in a broken chunk (allocData > 0) or if we write out the last completely + bool shared_last = (allocData > 0 || tempData == num) && smem.minBrokenChunkRow != smem.maxBrokenChunkRow; + if (threadIdx.x < (shared_last ? 2 : 1)) + { + + //write header + /*if(smem.A_row_ids.decodeRow(smem.minBrokenChunkRow) <= 2605 && smem.A_row_ids.decodeRow(smem.maxBrokenChunkRow) >= 2605)*/ + /*printf("%d %d broken writing header: %d<->%d .%d %d. (%d/%d/%d)\n", blockIdx.x, threadIdx.x, + smem.A_row_ids.decodeRow(smem.minBrokenChunkRow), smem.A_row_ids.decodeRow(smem.maxBrokenChunkRow), smem.firstRowCount, smem.lastRowCount, allocData, num, tempData);*/ + + Chunk::place(chunks, smem.chunkStartOffset, num, smem.A_row_ids.decodeRow(smem.minBrokenChunkRow), smem.firstRowCount, smem.lastRowCount, (static_cast(blockIdx.x) << ChunkSortingBits) | (smem.chunk_counter + threadIdx.x)); + + bool minrow = threadIdx.x == 0 && smem.minBrokenChunkRow != smem.maxBrokenChunkRow; + uint32_t r = smem.A_row_ids.decodeRow(minrow ? 
smem.minBrokenChunkRow : smem.maxBrokenChunkRow); + Chunk* c = Chunk::cast(chunks, smem.chunkStartOffset); + + /*printf("%d %d adding shared row: %d first: %d - for encoded rows %d %d\n", blockIdx.x, threadIdx.x, r, minrow, smem.minBrokenChunkRow, smem.maxBrokenChunkRow);*/ + addPotentiallySharedRow(r, c, minrow, output_row_list_heads, shared_rows_tracker, shared_rows_alloc); + atomicAdd(output_row_chunk_count + r, 1); + + // set new local restart information + smem.runflag = tempData == num ? consumedwork : (0x80000000 | (smem.A_row_ids.restartRowEncode(smem.maxBrokenChunkRow, block_start_end[0]) + 1)); + + //printf("%d %d updating tempData %d -= %d -> %d and temp offset: %d\n", blockIdx.x, threadIdx.x, tempData, num, tempData - num, num % TEMP_ITEMS_PER_BLOCK); + + smem.brokenChunkOffsetStart = smem.brokenChunkOffsetEnd; + + //reset count + tempData = tempData - num; + tempOffset = num % TEMP_ITEMS_PER_BLOCK; + if (threadIdx.x == 0) + smem.chunk_counter += (shared_last ? 2 : 1); + } + } + } + else + { + //if (threadIdx.x == 0) + // printf("%d %d normal chunk for %d-%d\n", blockIdx.x, threadIdx.x, smem.minRow, smem.maxRow); + if (threadIdx.x == 0) + { + uint32_t chunkoff = completeChunkAlloc(allocData, chunks, chunk_alloc, chunk_size, chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, chunk_pointer_pos, + [&]() + { + atomicOr(run_flag, 0x1); + // Write out descriptor for restart into global + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }, + [&]() + { + atomicOr(run_flag, 0x2); + // Write out descriptor for restart into global + completion_status[blockIdx.x] = smem.runflag; + chunk_counter[blockIdx.x] = smem.chunk_counter; + }); + + smem.chunkStartOffset = chunkoff; + } + __syncthreads(); + if (smem.chunkStartOffset == 0xFFFFFFFF) + return; + + // every first element in row -> run prefix sum to determine number of entries in row + // not first or last, directly set count + // first and last row for potential overlap + // atomicMax at count + // if 0 before -> alloc list element and atomic exchange with head and write info + next pointer into list + // if the head was non-zero (second list element, add shared row entry: + // atomicAdd for alloc and write row + // : add first row in chunk to beginning of chunk + // add numentires to chunk + // add offset to data and column ids to chunk info + // this info can be updated for shared rows when we extract stuff :) + + smem.firstRowCount = 0; + //RowCounter rc(smem.rowcounterMem); + + for (uint32_t written = 0; written < tempData; written += TEMP_ITEMS_PER_BLOCK) + { + //store in shared for coalesced out + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + uint32_t poffset = combinedEntries[i].memoffset(); + if (combinedEntries[i].isResult() && poffset >= written && poffset < written + TEMP_ITEMS_PER_BLOCK) + { + uint32_t pwrite = poffset - written; + INDEX_TYPE col = (combinedEntries[i].index & ((1u << colBits) - 1)) + smem.minCol; + typename ARowStorage::EncodedRowType row = (combinedEntries[i].index >> colBits) + smem.minRow; + //if (col > 21198119) + // printf("%d %d merge fucked up col: %d: %llx %d+d\n", blockIdx.x, threadIdx.x, col, combinedEntries[i].index, uint32_t(combinedEntries[i].index & ((1u << colBits) - 1)), smem.minCol); + smem.current_col_ids[pwrite] = col; + smem.current_row_ids[pwrite] = row; + smem.current_output[pwrite] = combinedEntries[i].value; + + //printf("%d %d entry %d: %d/%d %f\n", blockIdx.x, threadIdx.x, poffset, row, col, 
combinedEntries[i].value); + + /*if (col < smem.minCol || col > smem.maxCol || row < smem.minRow || row > smem.maxRow || row >= rows || col >= rows) + { + printf("%d %d bad entry: %llx %d = %d + %d (%d %d) %d (%d %d) - %d\n", blockIdx.x, threadIdx.x, combinedEntries[i].index, row, smem.minRow, (combinedEntries[i].index >> colBits), smem.minRow, smem.maxRow, col, smem.minCol, smem.maxCol, rows); + __trap(); + }*/ + + smem.rowCounts[pwrite] = combinedEntries[i].isRowend() ? combinedEntries[i].rowcount() : 0; + } + } + + __syncthreads(); + + #pragma unroll + for (int i = 0; i < RETAIN_ELEMENTS_PER_THREAD; ++i) + { + //write out + INDEX_TYPE rid; + uint32_t writeout = written + i * THREADS + threadIdx.x; + if (writeout < allocData) + { + typename SEMIRING_t::output_t* valstart = Chunk::cast(chunks, smem.chunkStartOffset)->values_direct(allocData); + INDEX_TYPE* indexstart = Chunk::cast(chunks, smem.chunkStartOffset)->indices_direct(allocData); + valstart[writeout] = smem.current_output[i * THREADS + threadIdx.x]; + indexstart[writeout] = smem.current_col_ids[i * THREADS + threadIdx.x]; + rid = smem.current_row_ids[i * THREADS + threadIdx.x]; + //printf("row id %d", smem.current_row_ids[i * THREADS + threadIdx.x]); + //fixme? + // if ((rid >= rows || rid < 0) && rid != std::numeric_limits::max() ) + // printf("%d %d fffffffffffitting rid: %d %d allocdata: %d, %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData, std::numeric_limits::max() - rid ); + + } + else + { + rid = std::numeric_limits::max(); + //fixme: suspicious if theres an error, I thought I discarded these changes + // if ((rid >= rows || rid < 0) && rid != std::numeric_limits::max() ) + // printf("%d %d Eeeeeeeeeeeeeenonfitting rid: %d %d allocdata: %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData); + } + + uint32_t rcount = smem.rowCounts[i * THREADS + threadIdx.x]; + if (rcount != 0 && rid < rows) + { + //write row count + //if (written + threadIdx.x == tempData - 1) + // smem.lastRowCount = rcount; + if (smem.firstRowCount == 0 && rid == smem.current_row_ids[0]) + smem.firstRowCount = rcount; + + // if (rid >= rows || rid < 0) + // printf("%d %d nonfitting rid: %d %d allocdata: %d\n", blockIdx.x, threadIdx.x, rid, rows,allocData); + auto b = smem.A_row_ids.decodeRow(rid); + atomicAdd(output_row_count + b, rcount); + } + } + __syncthreads(); + } + + bool shared_last = tempData == allocData && smem.minRow != smem.maxRow; + if (threadIdx.x < (shared_last ? 2 : 1)) + { + ////write header + //if (smem.A_row_ids.decodeRow(smem.minRow) >= 2605 && smem.A_row_ids.decodeRow(smem.maxRow) <= 2605) + //printf("%d %d writing header: %d<->%d .%d %d. (%d/%d)\n", blockIdx.x, threadIdx.x, + // smem.A_row_ids.decodeRow(smem.minRow), smem.A_row_ids.decodeRow(smem.maxRow), smem.firstRowCount, smem.lastRowCount, allocData, tempData); + Chunk::place(chunks, smem.chunkStartOffset, allocData, smem.A_row_ids.decodeRow(smem.minRow), smem.firstRowCount, smem.lastRowCount, (static_cast(blockIdx.x) << ChunkSortingBits) | (smem.chunk_counter + threadIdx.x)); + + + bool minrow = threadIdx.x == 0 && smem.minRow != smem.maxRow; + uint32_t r = smem.A_row_ids.decodeRow(minrow ? smem.minRow : smem.maxRow); + Chunk* c = Chunk::cast(chunks, smem.chunkStartOffset); + + //printf("%6d %4d adding shared row: %6d first: %d with %5d \n", blockIdx.x, threadIdx.x, r, minrow, minrow ? 
smem.firstRowCount : smem.lastRowCount); + addPotentiallySharedRow(r, c, minrow, output_row_list_heads, shared_rows_tracker, shared_rows_alloc); + atomicAdd(output_row_chunk_count + r, 1); + + // set new local restart information + smem.runflag = tempData == allocData ? consumedwork : (0x80000000 | (smem.A_row_ids.restartRowEncode(smem.maxRow, block_start_end[0]))); + + //printf("%d %d setting temp run flag to: %d == %d ? %d : (0x80000000 | %d) - %d -> %x %d\n", blockIdx.x, threadIdx.x, tempData, allocData, consumedwork, (smem.maxRow - block_start_end[0]), smem.maxRow, smem.runflag, smem.runflag & (~0x80000000)); + + //reset count + tempData = tempData - allocData; + tempOffset = allocData % TEMP_ITEMS_PER_BLOCK; + if (threadIdx.x == 0) + smem.chunk_counter += (shared_last ? 2 : 1); + } + } + } + else + { + // directly store to shared + #pragma unroll + for (int i = 0; i < CombineElements; ++i) + { + if (combinedEntries[i].isResult()) + { + uint32_t poffset = combinedEntries[i].memoffset(); + smem.current_col_ids[poffset] = (combinedEntries[i].index & ((1u << colBits) - 1)) + smem.minCol; + smem.current_row_ids[poffset] = (combinedEntries[i].index >> colBits) + smem.minRow; + smem.current_output[poffset] = combinedEntries[i].value; + } + } + + //if (threadIdx.x == 0) + // printf("%d keep: %d->%d %d\n", blockIdx.x, smem.minRow, smem.maxRow, tempData); + tempOffset = 0; + } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + // All done + completion_status[blockIdx.x] = 0xFFFFFFFF; + } +} + + +template + void AcSpGEMMKernels::h_computeSpgemmPart( + const typename SEMIRING_t::leftInput_t* valA, const INDEX_TYPE* indicesA, const OFFSET_TYPE* __restrict offsetsA, + /*fixme const T2 -> */const typename SEMIRING_t::rightInput_t* __restrict valB, const INDEX_TYPE* __restrict indicesB, const OFFSET_TYPE* __restrict offsetsB, + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, + OFFSET_TYPE* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos, SEMIRING_t semiring) +{ + HANDLE_ERROR(cudaGetLastError()); + + computeSpgemmPart< NNZ_PER_THREAD, THREADS, BLOCKS_PER_MP, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_PATH_OPTIONS, typename SEMIRING_t::leftInput_t, typename SEMIRING_t::rightInput_t, typename SEMIRING_t::output_t, INDEX_TYPE, OFFSET_TYPE, SORT_TYPE_MODE, T, U, Label,SEMIRING_t> <<>> + (valA, indicesA, offsetsA, valB, indicesB, offsetsB, startingIdsA, nnz, rows, chunks, chunk_alloc, chunk_worst_case, chunk_size, + chunks_pointers, chunk_pointer_alloc, chunk_pointer_sizes, output_row_count, output_row_list_heads, output_row_chunk_count, + shared_rows_tracker, shared_rows_alloc, expected_row_overlap, expected_row_overlap_inv, run_flag, completion_status, chunk_counter, chunk_pointer_pos, semiring); + HANDLE_ERROR(cudaGetLastError()); + +} + + +#define GPUCompressedMatrixMatrixMultiplyGEMM(TYPE, THREADS, BLOCKS_PER_MP, NNZPERTHREAD, INPUT_ELEMENTS_PER_THREAD, RETAIN_ELEMENTS_PER_THREAD, MERGE_MAX_CHUNKS, MERGE_MAX_PATH_OPTIONS) \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* 
__restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows,\ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count,\ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* __restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, \ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); \ + template void h_computeSpgemmPart \ + (const TYPE* valA, const uint32_t* indicesA, const uint32_t* __restrict offsetsA, \ + const TYPE* __restrict valB, const uint32_t* __restrict indicesB, const uint32_t* __restrict offsetsB, \ + const uint32_t* __restrict startingIdsA, uint32_t nnz, uint32_t rows, \ + uint32_t* chunks, uint32_t* chunk_alloc, uint32_t* chunk_worst_case, uint32_t chunk_size, \ + void** chunks_pointers, uint32_t* chunk_pointer_alloc, uint32_t chunk_pointer_sizes, \ + uint32_t* output_row_count, void** output_row_list_heads, uint32_t* output_row_chunk_count, \ + uint32_t* shared_rows_tracker, uint32_t* shared_rows_alloc, float expected_row_overlap, float expected_row_overlap_inv, \ + uint32_t* run_flag, uint32_t* completion_status, uint32_t* chunk_counter, uint32_t* chunk_pointer_pos); diff --git a/include/GALATIC/include/device/consistent_gpu_memory.h b/include/GALATIC/include/device/consistent_gpu_memory.h new file mode 100644 index 00000000..78392d90 --- /dev/null +++ b/include/GALATIC/include/device/consistent_gpu_memory.h @@ -0,0 +1,93 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The 
above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include +#include "../../include/devicetools/memory.h" +#include "../memory_space.h" +#include "../consistent_memory.h" + +namespace ACSpGEMM { + template<> + class ConsistentMemory : RegisteredMemory + { + size_t _size; + CU::unique_ptr _ptr; + + size_t clear() override + { + auto s = _size; + reset(0); + return s; + } + public: + ConsistentMemory() : _size(0) + { + register_consistent_memory(this); + } + + ~ConsistentMemory() + { + unregister_consistent_memory(this); + } + + operator CUdeviceptr() const noexcept { return _ptr; } + + template + T* get() const noexcept { return reinterpret_cast(_ptr.operator long long unsigned int()); } + + void increaseMemRetainData(size_t size) + { + CU::unique_ptr tmp_ptr = CU::allocMemory(_size + size); + cudaMemcpy(tmp_ptr.get(), _ptr.get(), _size, cudaMemcpyDeviceToDevice); + _ptr.reset(); + _ptr = std::move(tmp_ptr); + _size += size; + } + + void assure(size_t size) + { + if (size > _size) + { + _ptr.reset(); + _ptr = CU::allocMemory(size); + _size = size; + } + } + void reset(size_t size = 0) + { + _ptr.reset(); + _size = 0; + assure(size); + } + }; +} diff --git a/include/GALATIC/include/devicetools/consistent_memory.h b/include/GALATIC/include/devicetools/consistent_memory.h new file mode 100644 index 00000000..6a2c30df --- /dev/null +++ b/include/GALATIC/include/devicetools/consistent_memory.h @@ -0,0 +1,112 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once + +#include +#include "CUDATools/memory_space.h" + +namespace HiSparse +{ + namespace Detail + { + class RegisteredMemory + { + public: + virtual size_t clear() = 0; + }; + + std::vector& getRegMemories() + { + static std::vector m; + return m; + } + + void register_consistent_memory(RegisteredMemory* memory) + { + getRegMemories().push_back(memory); + } + void unregister_consistent_memory(RegisteredMemory* memory) + { + auto &m = getRegMemories(); + std::remove(begin(m), end(m), memory); + } + size_t clear_consistentMemory() + { + size_t s = 0; + for (auto m : getRegMemories()) + s += m->clear(); + return s; + } + + template + class ConsistentMemory; + + template + class RegisteredMemoryVar : RegisteredMemory + { + T v; + size_t clear() override + { + v = 0; + return 0; + } + public: + RegisteredMemoryVar() : v(0) + { + register_consistent_memory(this); + } + explicit RegisteredMemoryVar(T v) : v(v) + { + register_consistent_memory(this); + } + ~RegisteredMemoryVar() + { + unregister_consistent_memory(this); + } + + RegisteredMemoryVar& operator+= (T add) + { + v += add; + return *this; + } + + void operator = (T other) + { + v = other; + } + operator T() const noexcept + { + return v; + } + }; + } +} diff --git a/include/GALATIC/include/devicetools/error.h b/include/GALATIC/include/devicetools/error.h new file mode 100644 index 00000000..803e2922 --- /dev/null +++ b/include/GALATIC/include/devicetools/error.h @@ -0,0 +1,297 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
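The two headers above implement the consistent-memory bookkeeping used throughout GALATIC: every allocation wrapper derives from RegisteredMemory, registers itself in a process-wide list, and clear_consistentMemory() walks that list to release everything in one call, reporting how many bytes were freed. A minimal standalone sketch of the same pattern (simplified names, std::malloc in place of device memory; not the actual GALATIC classes):

#include <algorithm>
#include <cstdlib>
#include <vector>

struct Registered { virtual size_t clear() = 0; virtual ~Registered() = default; };

static std::vector<Registered*>& registry() { static std::vector<Registered*> r; return r; }

struct ScratchBuffer : Registered {
    void*  ptr  = nullptr;
    size_t size = 0;
    ScratchBuffer()  { registry().push_back(this); }
    ~ScratchBuffer() {
        registry().erase(std::remove(registry().begin(), registry().end(), this), registry().end());
        std::free(ptr);
    }
    // Grow-only allocation, mirroring ConsistentMemory::assure() above.
    void assure(size_t s) {
        if (s > size) { std::free(ptr); ptr = std::malloc(s); size = s; }
    }
    // Analogue of RegisteredMemory::clear(): drop the allocation, report its size.
    size_t clear() override { std::free(ptr); ptr = nullptr; size_t s = size; size = 0; return s; }
};

// Analogue of clear_consistentMemory(): release every registered buffer at once.
inline size_t clear_all() {
    size_t total = 0;
    for (Registered* r : registry()) total += r->clear();
    return total;
}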
+// + + +#ifndef INCLUDED_CUDA_ERROR +#define INCLUDED_CUDA_ERROR + +#pragma once + +#include +#include + +#include + + +namespace CU +{ + template + struct error_traits; + + template + class basic_error : public error_traits::category + { + public: + virtual CUresult code() const noexcept override; + virtual const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + class error : public std::exception + { + public: + virtual CUresult code() const noexcept = 0; + virtual const char* name() const noexcept = 0; + + using invalid_value = basic_error; + using out_of_memory = basic_error; + using not_initialized = basic_error; + using deinitialized = basic_error; + using profiler_disabled = basic_error; + //using profiler_not_initialized = basic_error; + //using profiler_already_started = basic_error; + //using profiler_already_stopped = basic_error; + using no_device = basic_error; + using invalid_device = basic_error; + using invalid_image = basic_error; + using invalid_context = basic_error; + //using context_already_current = basic_error; + using map_failed = basic_error; + using unmap_failed = basic_error; + using array_is_mapped = basic_error; + using already_mapped = basic_error; + using no_binary_for_gpu = basic_error; + using already_acquired = basic_error; + using not_mapped = basic_error; + using not_mapped_as_array = basic_error; + using not_mapped_as_pointer = basic_error; + using ecc_uncorrectable = basic_error; + using unsupported_limit = basic_error; + using context_already_in_use = basic_error; + using peer_access_unsupported = basic_error; + using invalid_ptx = basic_error; + using invalid_graphics_context = basic_error; + using nvlink_uncorrectable = basic_error; + using jit_not_found = basic_error; + using invalid_source = basic_error; + using file_not_found = basic_error; + using shared_object_symbol_not_found = basic_error; + using shared_object_init_failed = basic_error; + using operating_system = basic_error; + using invalid_handle = basic_error; + using not_found = basic_error; + using not_ready = basic_error; + using illegal_address = basic_error; + using launch_out_of_resources = basic_error; + using launch_timeout = basic_error; + using launch_incompatible_texturing = basic_error; + using peer_access_already_enabled = basic_error; + using peer_access_not_enabled = basic_error; + using primary_context_active = basic_error; + using context_is_destroyed = basic_error; + using assertion_failed = basic_error; + using too_many_peers = basic_error; + using host_memory_already_registered = basic_error; + using host_memory_not_registered = basic_error; + using hardware_stack_error = basic_error; + using illegal_instruction = basic_error; + using misaligned_address = basic_error; + using invalid_address_space = basic_error; + using invalid_pc = basic_error; + using launch_failed = basic_error; + using cooperative_launch_too_large = basic_error; + using not_permitted = basic_error; + using not_supported = basic_error; + using unknown = basic_error; + }; + + class logic_error : public error {}; + class runtime_error : public error {}; + class fatal_error : public runtime_error {}; + class bad_alloc : public error {}; + + + template + struct error_traits + { + using category = logic_error; + }; + + template + struct error_traits> + { + using category = runtime_error; + }; + + template + struct error_traits> + { + using category = fatal_error; + }; + + template + struct error_traits> + { + using category = bad_alloc; + }; + + extern 
template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + extern template class basic_error; + + + class unknown_error_code : public error + { + CUresult error_code; + + public: + unknown_error_code(CUresult error_code); + + CUresult code() const noexcept override; + const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + class unexpected_result : public error + { + CUresult result; + + public: + unexpected_result(CUresult result); + + CUresult code() const noexcept override; + const char* name() const noexcept override; + const char* what() const noexcept override; + }; + + + CUresult throw_error(CUresult result); + + inline CUresult succeed(CUresult result) + { + if (result != CUDA_SUCCESS) + throw unknown_error_code(throw_error(result)); + return result; + } + + + template + inline CUresult expect(CUresult result) + { + if (result != expected) + throw unexpected_result(result); + return result; + } + + template + inline CUresult expect(CUresult result) + { + if (result != expected_1) + return expect(result); + return result; + } +} + +using CU::throw_error; +using CU::succeed; +using CU::expect; + +#endif // INCLUDED_CUDA_ERROR diff --git a/include/GALATIC/include/devicetools/event.h b/include/GALATIC/include/devicetools/event.h new file mode 100644 index 00000000..faceecdc --- /dev/null +++ b/include/GALATIC/include/devicetools/event.h @@ -0,0 +1,58 @@ +// Project 
AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_CUDA_EVENT +#define INCLUDED_CUDA_EVENT + +#pragma once + +#include + +#include "unique_handle.h" + + +namespace CU +{ + struct EventDestroyDeleter + { + void operator ()(CUevent event) const + { + cuEventDestroy(event); + } + }; + + using unique_event = unique_handle; + + unique_event createEvent(unsigned int flags = CU_EVENT_DEFAULT); +} + +#endif // INCLUDED_CUDA_EVENT diff --git a/include/GALATIC/include/devicetools/memory.h b/include/GALATIC/include/devicetools/memory.h new file mode 100644 index 00000000..c5a7c13b --- /dev/null +++ b/include/GALATIC/include/devicetools/memory.h @@ -0,0 +1,95 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
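error.h, added earlier in this hunk, maps CUDA driver-API status codes onto a small C++ exception hierarchy: succeed() returns the CUresult unchanged on CUDA_SUCCESS and throws a CU::error-derived exception otherwise, so call sites do not have to check return codes by hand. A hedged usage sketch; the include path, a linked definition of throw_error(), and the context setup shown are assumptions, not part of this diff:

#include <cuda.h>
#include <iostream>
#include "devicetools/error.h"   // include path is an assumption

void allocate_scratch(CUdeviceptr& ptr, size_t bytes)
{
    // Throws instead of returning an error code on failure.
    succeed(cuMemAlloc(&ptr, bytes));
}

int main()
{
    try {
        succeed(cuInit(0));
        CUdevice dev;   succeed(cuDeviceGet(&dev, 0));
        CUcontext ctx;  succeed(cuCtxCreate(&ctx, 0, dev));

        CUdeviceptr scratch;
        allocate_scratch(scratch, 1 << 20);
        succeed(cuMemFree(scratch));
        succeed(cuCtxDestroy(ctx));
    } catch (const CU::error& e) {
        std::cerr << "CUDA call failed: " << e.what() << "\n";
        return 1;
    }
    return 0;
}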
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_CUDA_MEMORY +#define INCLUDED_CUDA_MEMORY + +#pragma once + +#include + +#include + +#include "../../include/devicetools/unique_handle.h" + + +namespace CU +{ + struct MemFreeDeleter + { + void operator ()(CUdeviceptr ptr) const + { + cudaFree(reinterpret_cast(ptr)); + } + }; + + using unique_ptr = unique_handle; + + + struct pitched_memory + { + pitched_memory(const pitched_memory&) = delete; + pitched_memory& operator =(const pitched_memory&) = delete; + + unique_ptr memory; + std::size_t pitch; + + pitched_memory() {} + + pitched_memory(unique_ptr memory, std::size_t pitch) + : memory(std::move(memory)), + pitch(pitch) + { + } + + pitched_memory(pitched_memory&& m) + : memory(std::move(m.memory)), + pitch(m.pitch) + { + } + + pitched_memory& operator =(pitched_memory&& m) + { + using std::swap; + swap(memory, m.memory); + pitch = m.pitch; + return *this; + } + }; + + + unique_ptr allocMemory(std::size_t size); + unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size); + pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size); +} + +#endif // INCLUDED_CUDA_MEMORY diff --git a/include/GALATIC/include/devicetools/memory_space.h b/include/GALATIC/include/devicetools/memory_space.h new file mode 100644 index 00000000..3e5aeb4d --- /dev/null +++ b/include/GALATIC/include/devicetools/memory_space.h @@ -0,0 +1,41 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
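memory.h wraps raw CUdeviceptr allocations in a move-only handle whose deleter calls cudaFree, so device buffers are released automatically when they leave scope. The same idea expressed against the CUDA runtime API with std::unique_ptr, as a standalone analogue (this is not the CU::unique_ptr / allocMemory from the header):

#include <cuda_runtime_api.h>
#include <memory>
#include <stdexcept>

struct CudaFreeDeleter {
    void operator()(void* p) const { cudaFree(p); }
};

template <typename T>
using device_buffer = std::unique_ptr<T, CudaFreeDeleter>;

// Allocates count elements of T on the device; throws on failure.
template <typename T>
device_buffer<T> alloc_device(std::size_t count)
{
    void* raw = nullptr;
    if (cudaMalloc(&raw, count * sizeof(T)) != cudaSuccess)
        throw std::runtime_error("cudaMalloc failed");
    return device_buffer<T>(static_cast<T*>(raw));
}

// Usage: the buffer is freed automatically at the end of the enclosing scope.
//   auto values = alloc_device<float>(1 << 20);
//   cudaMemset(values.get(), 0, (1 << 20) * sizeof(float));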
+// + +#pragma once + +namespace HiSparse +{ + enum class MemorySpace + { + host, + device + }; +} \ No newline at end of file diff --git a/include/GALATIC/include/devicetools/stream.h b/include/GALATIC/include/devicetools/stream.h new file mode 100644 index 00000000..f570457c --- /dev/null +++ b/include/GALATIC/include/devicetools/stream.h @@ -0,0 +1,59 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + + +#ifndef INCLUDED_CUDA_STREAM +#define INCLUDED_CUDA_STREAM + +#pragma once + +#include + +#include "../../include/devicetools/unique_handle.h" + + + +namespace CU +{ + struct StreamDestroyDeleter + { + void operator ()(CUstream stream) const + { + cuStreamDestroy(stream); + } + }; + + using unique_stream = unique_handle; + + unique_stream createStream(unsigned int flags = CU_STREAM_DEFAULT); +} + +#endif // INCLUDED_CUDA_STREAM diff --git a/include/GALATIC/include/devicetools/unique_handle.h b/include/GALATIC/include/devicetools/unique_handle.h new file mode 100644 index 00000000..32ef72a2 --- /dev/null +++ b/include/GALATIC/include/devicetools/unique_handle.h @@ -0,0 +1,132 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + + +#ifndef INCLUDED_CUDA_UNIQUE_HANDLE +#define INCLUDED_CUDA_UNIQUE_HANDLE + +#pragma once + +#include + + +namespace CU +{ + template + class unique_handle : Deleter + { + T h; + + void free(T handle) noexcept + { + if (handle != NULL_VALUE) + Deleter::operator ()(handle); + } + + public: + unique_handle(const unique_handle&) = delete; + unique_handle& operator =(const unique_handle&) = delete; + + using handle_type = T; + using deleter_type = Deleter; + + static constexpr T null_value = NULL_VALUE; + + explicit unique_handle(T handle = NULL_VALUE) noexcept + : h(handle) + { + } + + void consume(T handle) noexcept { h = handle; } + + + unique_handle(T handle, const Deleter& d) noexcept + : Deleter(d), + h(handle) + { + } + + unique_handle(T handle, Deleter&& d) noexcept + : Deleter(std::move(d)), + h(handle) + { + } + + unique_handle(unique_handle&& h) noexcept + : Deleter(std::move(static_cast(h))), + h(h.h) + { + h.h = NULL_VALUE; + } + + ~unique_handle() + { + free(h); + } + + operator T() const noexcept { return h; } + + template + DataType* get() const noexcept { return reinterpret_cast(h); } + + template + DataType* getRelease() noexcept { DataType* tmp = reinterpret_cast(h); h = 0ULL; return tmp; } + + unique_handle& operator =(unique_handle&& h) noexcept + { + using std::swap; + swap(*this, h); + return *this; + } + + T release() noexcept + { + T temp = h; + h = NULL_VALUE; + return temp; + } + + void reset(T handle = null_value) noexcept + { + using std::swap; + swap(this->h, handle); + free(handle); + } + + friend void swap(unique_handle& a, unique_handle& b) noexcept + { + using std::swap; + swap(a.h, b.h); + } + }; +} + +#endif // INCLUDED_CUDA_UNIQUE_HANDLE diff --git a/include/GALATIC/include/execution_stats.h b/include/GALATIC/include/execution_stats.h new file mode 100644 index 00000000..4fb6df11 --- /dev/null +++ b/include/GALATIC/include/execution_stats.h @@ -0,0 +1,159 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
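unique_handle, completed above, is the move-only RAII core behind unique_event, unique_stream and unique_ptr: it stores a raw handle together with a stateless deleter, destroys the handle when it goes out of scope, and offers release()/reset()/swap plus an implicit conversion back to the raw type. A hedged usage sketch of the createEvent()/createStream() helpers declared earlier in this diff; the include paths and an already-initialized CUDA context are assumptions:

#include <cuda.h>
#include "devicetools/event.h"    // include paths are assumptions
#include "devicetools/stream.h"

float timed_section()
{
    // Move-only handles: cuStreamDestroy / cuEventDestroy run automatically
    // when these objects leave scope.
    CU::unique_stream stream = CU::createStream();
    CU::unique_event  start  = CU::createEvent();
    CU::unique_event  stop   = CU::createEvent();

    cuEventRecord(start, stream);     // implicit conversion to CUevent / CUstream
    // ... enqueue work on `stream` here ...
    cuEventRecord(stop, stream);
    cuEventSynchronize(stop);

    float ms = 0.0f;
    cuEventElapsedTime(&ms, start, stop);
    return ms;
}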
+// + +#pragma once + +#include +#include + +struct ExecutionStats +{ + //timings + bool measure_all; + float duration; + float duration_blockstarts; + float duration_spgemm; + float duration_merge_case_computation; + float duration_merge_simple; + float duration_merge_max; + float duration_merge_generalized; + float duration_write_csr; + + + //merge cases + uint32_t shared_rows; + uint32_t simple_mergers; + uint32_t simple_rows; + uint32_t complex_rows; + uint32_t generalized_rows; + + //memory consumption + size_t mem_allocated_chunks; + size_t mem_used_chunks; + size_t mem_clear_return; + + //misc + size_t restarts; + int called{ 0 }; + friend std::ostream& operator<<(std::ostream&, const ExecutionStats&); + + ExecutionStats() : measure_all(false), + duration(0), duration_blockstarts(0), duration_spgemm(0), duration_merge_case_computation(0), + duration_merge_simple(0), duration_merge_max(0), duration_merge_generalized(0), duration_write_csr(0), + shared_rows(0), simple_mergers(0), simple_rows(0), complex_rows(0), generalized_rows(0), + mem_allocated_chunks(0), mem_used_chunks(), mem_clear_return(0), + restarts(0) { } + + ExecutionStats& operator+=(const ExecutionStats& stats) + { + this->duration += stats.duration; + this->duration_blockstarts += stats.duration_blockstarts; + this->duration_spgemm += stats.duration_spgemm; + this->duration_merge_case_computation += stats.duration_merge_case_computation; + this->duration_merge_simple += stats.duration_merge_simple; + this->duration_merge_max += stats.duration_merge_max; + this->duration_merge_generalized += stats.duration_merge_generalized; + this->duration_write_csr += stats.duration_write_csr; + this->shared_rows += stats.shared_rows; + this->simple_mergers += stats.simple_mergers; + this->simple_rows += stats.simple_rows; + this->complex_rows += stats.complex_rows; + this->generalized_rows += stats.generalized_rows; + this->mem_allocated_chunks += stats.mem_allocated_chunks; + this->mem_used_chunks += stats.mem_used_chunks; + this->mem_clear_return += stats.mem_clear_return; + this->restarts += stats.restarts; + ++called; + // printf("Overall: %f and added up: %f\n", stats.duration, (stats.duration_blockstarts + stats.duration_spgemm + stats.duration_merge_case_computation + + // stats.duration_merge_simple + stats.duration_merge_max + stats.duration_merge_generalized + stats.duration_write_csr)); + return *this; + } + + void reset() + { + this->duration = 0.0f; + this->duration_blockstarts = 0.0f; + this->duration_spgemm = 0.0f; + this->duration_merge_case_computation = 0.0f; + this->duration_merge_simple = 0.0f; + this->duration_merge_max = 0.0f; + this->duration_merge_generalized = 0.0f; + this->duration_write_csr = 0.0f; + this->shared_rows = 0; + this->simple_mergers = 0; + this->simple_rows = 0; + this->complex_rows = 0; + this->generalized_rows = 0; + this->mem_allocated_chunks = 0; + this->mem_used_chunks = 0; + this->mem_clear_return = 0; + this->restarts = 0; + } + + void normalize() + { + if (called) + { + float division_factor = static_cast(called); + this->duration /= division_factor; + this->duration_blockstarts /= division_factor; + this->duration_spgemm /= division_factor; + this->duration_merge_case_computation /= division_factor; + this->duration_merge_simple /= division_factor; + this->duration_merge_max /= division_factor; + this->duration_merge_generalized /= division_factor; + this->duration_write_csr /= division_factor; + this->shared_rows /= called; + this->simple_mergers /= called; + this->simple_rows /= 
called; + this->complex_rows /= called; + this->generalized_rows /= called; + this->mem_allocated_chunks /= called; + this->mem_used_chunks /= called; + this->mem_clear_return /= called; + this->restarts /= called; + } + } +}; + +inline std::ostream& operator<<(std::ostream& os, const ExecutionStats& obj) { + os << "Overall Duration: " << obj.duration << " ms\n"; + os << "Restarts: " << obj.restarts << std::endl; + if (obj.measure_all) + { + os << "Sum individual timings: " << obj.duration_blockstarts + obj.duration_spgemm + obj.duration_merge_case_computation + obj.duration_merge_simple + obj.duration_merge_max + obj.duration_merge_generalized + obj.duration_write_csr << " ms\n"; + os << std::string("Duration BlockStarts: ") << obj.duration_blockstarts << " ms | Duration SpGEMM: " << obj.duration_spgemm << " ms\n"; + os << "Duration MergeCase: " << obj.duration_merge_case_computation << " ms | Duration Merge Simple: " << obj.duration_merge_simple << " ms\n"; + os << "Duration Merge Max: " << obj.duration_merge_max << " ms | Duration Merge Generalized: " << obj.duration_merge_generalized << " ms\n"; + os << "Duration Merge Write CSR: " << obj.duration_write_csr << " ms\n"; + } + return os; +} diff --git a/include/GALATIC/include/memory_space.h b/include/GALATIC/include/memory_space.h new file mode 100644 index 00000000..86e4816b --- /dev/null +++ b/include/GALATIC/include/memory_space.h @@ -0,0 +1,39 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
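ExecutionStats, defined just above, is accumulate-and-average bookkeeping: reset() zeroes a per-iteration object, operator+= folds it into a running total and increments the internal call counter, normalize() divides every field by that counter, and operator<< prints the per-phase breakdown when measure_all is set. Roughly how the benchmark driver later in this diff uses it, with the actual multiply call replaced by a placeholder:

#include <iostream>
#include "execution_stats.h"   // include path is an assumption

void benchmark(int iterations)
{
    ExecutionStats per_run, total;
    per_run.measure_all = true;
    total.measure_all   = true;   // enables the detailed printout below

    for (int i = 0; i < iterations; ++i)
    {
        per_run.reset();
        // run_spgemm(per_run);    // placeholder: fills the duration_* fields
        total += per_run;          // accumulates and bumps `called`
    }

    total.normalize();             // average over the number of iterations
    std::cout << total;            // operator<< prints the per-phase timings
}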
+// + +#pragma once + + +enum class MemorySpace +{ + host, + device +}; diff --git a/include/GALATIC/include/meta_utils.h b/include/GALATIC/include/meta_utils.h new file mode 100644 index 00000000..623759d3 --- /dev/null +++ b/include/GALATIC/include/meta_utils.h @@ -0,0 +1,274 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#ifndef INCLUDED_HIS_META_UTILS +#define INCLUDED_HIS_META_UTILS + +#pragma once + +#include +#include +#include "multi_arch_build.h" + + + using std::enable_if; + using std::declval; + using std::is_empty; + using std::conditional; + + template + struct type_match + { + static const bool value = false; + }; + + template + struct type_match + { + static const bool value = true; + }; + + template + struct static_divup + { + static const int value = (X + Y - 1) / Y; + }; + + template + struct static_popcnt + { + static const int value = ((X & 0x1) + static_popcnt< (X >> 1) >::value); + }; + template<> + struct static_popcnt<0> + { + static const int value = 0; + }; + + template + struct static_clz + { + static const int value = (X & 0x80000000) ? Completed : static_clz< (X << 1), Completed + 1 >::value; + }; + template + struct static_clz + { + static const int value = 32; + }; + + template + struct static_max; + + template + struct static_max + { + static const int value = VALUE; + }; + + template + struct static_max + { + static const int next_value = static_max::value; + static const int value = VALUE > next_value ? VALUE : next_value; + }; + + template + struct static_min; + + template + struct static_min + { + static const int value = VALUE; + }; + + template + struct static_min + { + static const int next_value = static_min::value; + static const int value = VALUE < next_value ? 
VALUE : next_value; + }; + + template + struct choose; + + template + struct choose + { + typedef typename choose::type type; + }; + template + struct choose<0, NC, NCS...> + { + typedef NC type; + }; + + + template + struct conditional_eval; + + template<> + struct conditional_eval + { + template + DUAL_BUILD_FUNCTION static void eval(F f) + { + f(); + } + }; + template<> + struct conditional_eval + { + template + DUAL_BUILD_FUNCTION static void eval(F f) + { + } + }; + + template class CONSUMER, int V, int END, int STEP, bool DONE, int... VALUES> + struct static_for_impl + { + using type = typename static_for_impl < CONSUMER, V+STEP, END, STEP, (V + STEP < END), VALUES..., V>::type; + }; + template class CONSUMER, int V, int END, int STEP, int... VALUES> + struct static_for_impl + { + using type = CONSUMER ; + }; + + template class CONSUMER, int END, int BEGIN = 0, int STEP = 1> + struct static_for + { + using type = typename static_for_impl < CONSUMER, BEGIN, END, STEP, (BEGIN < END)>::type; + }; + + + template + struct type_list { }; + + template class APPLIER, class COMBLIST, class... TYPELISTS> + struct apply_list_impl; + template class APPLIER, class... DONETYPES, class... NEWTYPES, class... REMTYPELISTS> + struct apply_list_impl, type_list, REMTYPELISTS...> + { + using type = typename apply_list_impl, REMTYPELISTS...>::type; + }; + template class APPLIER, class... DONETYPES> + struct apply_list_impl> + { + using type = APPLIER; + }; + template class APPLIER, class... TYPELISTS> + struct apply_list + { + using type = typename apply_list_impl, TYPELISTS... >::type; + }; + + template + struct inverse_list_impl; + template + struct inverse_list_impl, type_list> + { + using type = typename inverse_list_impl, type_list>::type; + }; + template + struct inverse_list_impl> + { + using type = INVERSE_LIST; + }; + template + struct inverse_list + { + using type = typename inverse_list_impl, TYPELIST>::type; + }; + + + template + struct sequence { }; + + template class APPLIER, class SEQUENCE> + struct apply_sequence; + template class APPLIER, int... 
NUMS> + struct apply_sequence> + { + using type = APPLIER; + }; + + template + struct select_from_impl; + template + struct select_from_impl, sequence> + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; + }; + template + struct select_from_impl, sequence> + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence, sequence > ::type; + }; + template + struct select_from_impl, sequence<>> + { + using type = sequence; + }; + template + struct select_from + { + using type = typename select_from_impl <(MASK >> 1U), MASK & 0x1, sequence<>, SEQUENCE > ::type; + }; + + + template class LOGICAL, class SEQUENCE> + struct sequence_any; + template class LOGICAL, int NUM, int...NUMS> + struct sequence_any > + { + static const bool value = LOGICAL::value || sequence_any>::value; + }; + template class LOGICAL> + struct sequence_any > + { + static const bool value = false; + }; + + template + struct static_is_zero + { + static const bool value = false; + }; + template<> + struct static_is_zero<0> + { + static const bool value = true; + }; + + +#endif //INCLUDED_HIS_META_UTILS diff --git a/include/GALATIC/include/multi_arch_build.h b/include/GALATIC/include/multi_arch_build.h new file mode 100644 index 00000000..ba5f8747 --- /dev/null +++ b/include/GALATIC/include/multi_arch_build.h @@ -0,0 +1,45 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
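meta_utils.h collects small compile-time helpers: static_divup is ceiling division, static_popcnt counts set bits, static_min/static_max fold an integer parameter pack, and static_for instantiates a consumer over an integer range. A few compile-time sanity checks, assuming plain int template parameters for these helpers (that parameter list is an assumption) and an assumed include path:

#include "meta_utils.h"   // include path is an assumption

static_assert(static_divup<10, 4>::value == 3,   "ceil(10 / 4) == 3");
static_assert(static_popcnt<0b1011>::value == 3, "three bits set in 0b1011");
static_assert(static_min<7, 3, 5>::value == 3,   "minimum of the pack");
static_assert(static_max<7, 3, 5>::value == 7,   "maximum of the pack");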
+// + +#pragma once + +#ifdef __CUDACC__ +#define DUAL_BUILD_FUNCTION __host__ __device__ +#else +#define DUAL_BUILD_FUNCTION +#endif + +#ifndef __CUDA_ARCH__ +inline float __uint_as_float(unsigned t) +{ + return *reinterpret_cast(&t); +} +#endif diff --git a/include/GALATIC/include/performTestCase.cu b/include/GALATIC/include/performTestCase.cu new file mode 100644 index 00000000..b614c302 --- /dev/null +++ b/include/GALATIC/include/performTestCase.cu @@ -0,0 +1,1019 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
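multi_arch_build.h keeps shared helpers compilable by both nvcc and a plain host compiler: DUAL_BUILD_FUNCTION expands to __host__ __device__ only under __CUDACC__, and a host-side fallback for __uint_as_float is supplied otherwise. Typical use, as a sketch (the function name and include path are illustrative, not from this diff):

#include "multi_arch_build.h"   // include path is an assumption

// Callable from host code and, when compiled with nvcc, from device code too;
// an ordinary inline function under any other compiler.
DUAL_BUILD_FUNCTION inline unsigned int divup(unsigned int a, unsigned int b)
{
    return (a + b - 1) / b;
}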
+// + +/*!/------------------------------------------------------------------------------ +* performTestCase.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +//surpress crash notification windows (close or debug program window) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#else +#include +#endif + +// Local includes +#include "dCSR.cuh" + +#include "CSR.cuh" +#include "COO.cuh" +#include "Vector.h" +#include "dVector.h" +#include "device/Multiply.cuh" +#include "Transpose.h" +#include "Compare.cuh" +#include "consistent_memory.h" +#include "CustomExceptions.h" +#include "SemiRingInterface.h" + +#ifdef _WIN32 +#include +#include +#include +using namespace std::filesystem; +#else +#include +using namespace std::experimental::filesystem; +#endif + +// CuSparse include +#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #include "nsparse/include/nsparseMultiply.h" + +// // RMerge include +// #include "RMerge/include/rmergeMultiply.h" + +// // BhSparse include +// #include"bhSparse/include/bhSparseMultiply.h" +struct canonical {}; + +struct testdouble : SemiRing, { + static double multiply(double& a, double& b) { return a * b; } + static double add(double & a, double & b) { return a + b; } + + static double MultiplicativeIdentity() { + return 1; + } + static double AdditiveIdentity() { + return 0; + } +}; + + +unsigned int padding = 0; +template +std::string typeext(); +template<> +std::string typeext() +{ + return std::string(""); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +template +std::string nameextension() +{ + return ""; +} +template<> +std::string nameextension() +{ + return "_d"; +} +template<> +std::string nameextension() +{ + return "_f"; +} + +template +bool isFloat() +{ + return false; +} + +template<> +bool isFloat() +{ + return true; +} + +// ################################################################# +// +uint32_t numTrailingBinaryZeros(uint32_t n) +{ + uint32_t mask = 1; + for (uint32_t i = 0; i < 32; i++, mask <<= 1) + if ((n & mask) != 0) + return i; + + return 32; +} + +// ################################################################# +// +void writeDetailedInfo(const ExecutionStats& stats, std::ofstream& out) +{ + out << stats.shared_rows << ";"; + out << stats.simple_rows << ";"; + out << stats.simple_mergers << ";"; + out << stats.complex_rows << ";"; + out << stats.generalized_rows << ";"; + out << stats.duration << ";"; + out << stats.duration_blockstarts << ";"; + out << stats.duration_spgemm << ";"; + out << stats.duration_merge_case_computation << ";"; + out << stats.duration_merge_simple << ";"; + out << stats.duration_merge_max << ";"; + out << stats.duration_merge_generalized << ";"; + out << stats.duration_write_csr << ";"; + out << stats.mem_clear_return << ";"; + out << stats.mem_allocated_chunks << ";"; + out << stats.mem_used_chunks << ";"; + out << stats.restarts << ";"; + out << std::endl; +} + +// ################################################################# +// +void getNextMatrix(const char* foldername, const std::string& lastname, std::string& nextname) +{ + bool found_last = false; + directory_iterator it{ foldername }; + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + continue; + if 
(it->path().extension() != ".mtx") + continue; + if (!found_last) + { + if (it->path().filename() != lastname) + continue; + else + { + found_last = true; + continue; + } + } + else + { + nextname = it->path().filename().string(); + return; + } + } + nextname = std::string(""); + return; +} + +// ################################################################# +// +std::string getColumnHeaders(uint32_t approaches, std::string prefix = "") +{ + std::string headers(prefix); + + if (approaches & (0x1 << 0)) + headers.append("cuSparse;"); + if (approaches & (0x1 << 1)) + headers.append("acSpGEMM;"); + // if (approaches & (0x1 << 2)) + // headers.append("nsparse;"); + // if (approaches & (0x1 << 3)) + // headers.append("RMerge;"); + // if (approaches & (0x1 << 4)) + // headers.append("bhSparse;"); + + headers.append("\n"); + + return headers; +} + +// ################################################################# +// +template +void writeMatrixStats(CSR& mat, const std::string matname, std::ofstream& outfs) +{ + typename CSR::Statistics stats = mat.rowStatistics(); + //"\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max; + outfs << matname << ";" << mat.rows << ";" << mat.cols << ";" << mat.nnz << ";" + << stats.mean << ";" << stats.std_dev << ";" << stats.min << ";" << stats.max << ";"; +} + +// ################################################################# +// +template +size_t countFloatingPointOperations(CSR& matA, CSR& matB) +{ + size_t count = 0; + for (auto nnzAiter = 0; nnzAiter < matA.nnz; ++nnzAiter) + count += matB.row_offsets[matA.col_ids[nnzAiter] + 1] - matB.row_offsets[matA.col_ids[nnzAiter]]; + return count; +} + +// ################################################################# +// +std::ostream& writeGPUInfo(std::ostream& file) +{ + int cudaDevice; + cudaGetDevice(&cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." << prop.minor << "\n"; + + file << "name;cc;num_multiprocessors;warp_size;max_threads_per_mp;regs_per_mp;shared_memory_per_mp;total_constant_memory;total_global_memory;clock_rate;max_threads_per_block;max_regs_per_block;max_shared_memory_per_block\n" + << prop.name << ';' + << prop.major << '.' 
+ << prop.minor << ';' + << prop.multiProcessorCount << ';' + << prop.warpSize<< ';' + << prop.maxThreadsPerMultiProcessor << ';' + << prop.regsPerMultiprocessor << ';' + << prop.sharedMemPerMultiprocessor << ';' + << prop.totalConstMem << ';' + << prop.totalGlobalMem << ';' + << prop.clockRate * 1000 << ';' + << prop.maxThreadsPerBlock << ';' + << prop.regsPerBlock << ';' + << prop.sharedMemPerBlock + << std::endl; + return file; +} + +// ################################################################# +// +template +int performSpGEMMTests(int argc, char ** argv) +{ + std::string name_extension = ""; + + bool runtests = true; + if (argc > 2) + runtests = std::string(argv[2]) != "0"; + + int cudaDevice = 0; + if (argc > 3) + cudaDevice = std::atoi(argv[3]); + + bool continue_run = true; + // if (argc > 4) + // continue_run = std::string(argv[4]) != "0"; + + std::vector trait_init = { 256, 3, 2, 4, 4, 16, 256, 8 }; + if (argc > 5) + { + + std::istringstream traitstream(argv[5]); + std::vector input_trait_init; + std::string val; + while (std::getline(traitstream, val, ',')) + input_trait_init.push_back(std::stoi(val)); + + if (input_trait_init.size() != trait_init.size()) + printf("Malformed trait init input param; %zu params required; fallback to default\n", trait_init.size()); + else + trait_init = input_trait_init; + } + + uint32_t approach_selector = 0xFFFFFFFF; + uint32_t first_approach = 0; + if (argc > 6) + { + approach_selector = std::stoi(argv[6]); + first_approach = numTrailingBinaryZeros(approach_selector); + if (approach_selector == 0) + { + printf("ERROR: No approaches selected for testing\n"); + return 0; + } + } + + cudaSetDevice(cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + std::string gpuname = prop.name; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(trait_init[0], trait_init[1], trait_init[2], trait_init[3], trait_init[4], trait_init[5], trait_init[6], trait_init[7]); + DefaultTraits.preferLoadBalancing = true; + + std::ofstream results; + std::ofstream mem_consumption; + std::ofstream ours_detailed; + std::ofstream stateout; + std::ofstream statsout; //This will go horribly wrong: stateout vs statsout + std::string trait_string = + std::to_string(trait_init[0]) + + "_" + std::to_string(trait_init[1]) + + "_" + std::to_string(trait_init[2]) + + "_" + std::to_string(trait_init[3]) + + "_" + std::to_string(trait_init[4]) + + "_" + std::to_string(trait_init[5]) + + "_" + std::to_string(trait_init[6]) + + "_" + std::to_string(trait_init[7]) + "_"; + std::string statefile = std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".state"; + std::string lastname; + std::string current_name; + unsigned num_approaches = 6; + unsigned current_approach = first_approach; + bool finished_write = true; + bool fresh_file = !continue_run; + if (continue_run) + { + std::ifstream last(statefile.c_str()); + if (last) + std::getline(last, lastname); + + if (last && !lastname.empty()) + { + current_name = lastname; + std::cout << "Continuing run after " << lastname << std::endl; + results.open((std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + mem_consumption.open((std::string("mem_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + ours_detailed.open((std::string("detailed_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + statsout.open("matrix_stats.csv", std::ios_base::app); + std::cout << "After open" << std::endl; + + + std::time_t now = std::time(NULL); + std::tm * ptm = std::localtime(&now); + char buffer[32]; + // Format: Mo, 15.06.2009 20:20:00 + std::strftime(buffer, 32, "%a, %d.%m.%Y %H:%M:%S", ptm); + std::cout << buffer << std::endl; + + std::string lastapproach; + std::getline(last, lastapproach); + current_approach = (std::stoi(lastapproach) + 1) % num_approaches; + std::string finished_write_string; + std::getline(last, finished_write_string); + finished_write = !finished_write_string.empty(); + + if (!finished_write) + { + results << -3 << ";"; + mem_consumption << -3 << ";"; + finished_write = true; + } + + last.close(); + + if (!(approach_selector & (0x1 << current_approach))) + { + //this limits us to 31 approaches :-p + uint32_t next_offset = numTrailingBinaryZeros((approach_selector & 0xEFFFFFFF) >> current_approach); + if (next_offset < sizeof(uint32_t) * 8) + { + current_approach += next_offset; + } + else + { + current_approach = first_approach; + + results << std::endl; + mem_consumption << std::endl; + + const char *foldername = argc == 1 ? "." : argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + } + } + else if (current_approach < std::stoi(lastapproach)) + { + const char *foldername = argc == 1 ? "." 
: argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + results << std::endl; + mem_consumption << std::endl; + + if (current_name.empty()) + { + return 0; + } + } + } + else + { + fresh_file = true; + } + last.close(); + stateout.open(statefile.c_str()); + } + + if (fresh_file) + { + + results.open((std::string("perf_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str()); + results << "\"sep=;\"\n"; + writeGPUInfo(results); + results << getColumnHeaders(approach_selector, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;Products;"); + + mem_consumption.open((std::string("mem_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + mem_consumption << "\"sep=;\"\n"; + writeGPUInfo(mem_consumption); + mem_consumption << getColumnHeaders(approach_selector & 14, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;"); + mem_consumption << std::endl; + + ours_detailed.open((std::string("detailed_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + ours_detailed << "\"sep=;\"\n"; + writeGPUInfo(ours_detailed); + ours_detailed << std::string("\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;num_shared_rows;simple_rows;simple_mergers;complex_mergers;generalized_mergers;") + + std::string("duration;duration_blockstarts;duration_spgemm;duration_merge_case_computation;duration_merge_simple;duration_merge_max;duration_merge_generalized;duration_write_csr;") + + std::string("clear_return;chunk_alloc;chunk_used;restarts;\n"); + ours_detailed << std::endl; + + statsout.open("matrix_stats.csv", std::ios_base::app); + statsout << "\"sep=;\"\n"; + statsout << "\nMatrix; rows; cols; nnz; r_mean; r_std_dev; r_min; r_max;" << std::endl; + } + + + CSR csrmat, csrmat2, result_mat; + + char *foldername; + if (argc == 1) + { + foldername = const_cast("."); + } + else + foldername = argv[1]; + + bool found = fresh_file; + directory_iterator it{ foldername }; + + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + { + continue; + } + if (it->path().extension() != ".mtx") + { + continue; + } + if (!found && continue_run) + { + if (current_name.compare(it->path().filename().string()) != 0) + { + // std::cout << "Filename not current name\n"; + // std::cout << it->path().filename() << it->path().filename().string().length() << std::endl; + // std::cout << current_name << current_name.length() << std::endl; + continue; + } + else + found = true; + } + + std::string testname = it->path().filename().stem().string(); + std::cout << "\n\nrunning " << testname << std::endl; + std::string mantname = it->path().string(); + std::string csr_name = mantname + typeext() + ".hicsr"; + + if (approach_selector & (0x1 << current_approach)) + { + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csrmat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << mantname << "\"\n"; + COO coo_mat = loadMTX(mantname.c_str()); + convert(csrmat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + std::cout << "Skipping matrix \"" << mantname.c_str() << "\"\n"; + continue; + } + try + { + std::cout 
<< "write csr file for future use\n"; + storeCSR(csrmat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + } + + if (current_approach == first_approach) + { + auto rowStats = csrmat.rowStatistics(); + + results << testname << ";"; + results << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + + mem_consumption << testname << ";"; + mem_consumption << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + + ours_detailed << testname << ";"; + ours_detailed << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + } + + if (continue_run) + stateout << it->path().filename().string() << std::endl << current_approach << std::endl; + + if (runtests) + { + std::cout << "Matrix: " << csrmat.rows << "x" << csrmat.cols << ": " << csrmat.nnz << " nonzeros\n"; + + int32_t warmup = 20; + int32_t iterations = 20; + + // if (csrmat.nnz>= 5000000) + // { + // warmup = 2; + // iterations = 10; + // } + + try + { + dCSR gpu_csrmat, gpu_csrmat2, d_csr_cuRes; + convert(gpu_csrmat, csrmat, 0); + cuSPARSE::CuSparseTest cusparse; + + //calculate the transpose if matrix is not square + if (gpu_csrmat.rows != gpu_csrmat.cols) + { + cusparse.Transpose(gpu_csrmat, gpu_csrmat2); + convert(csrmat2, gpu_csrmat2); + } + else + { + convert(gpu_csrmat2, csrmat, 0); + convert(csrmat2, csrmat, 0); + } + + //generate reference solution using cuSparse + unsigned cuSubdiv_nnz = 0; + if (current_approach != 0 || current_approach == first_approach) + { + cusparse.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + + if (current_approach == first_approach) + { + //write out stats of result matrix + CSR h_csr_cuRes; + convert(h_csr_cuRes, d_csr_cuRes); + writeMatrixStats(h_csr_cuRes, testname, statsout); + size_t fpo = countFloatingPointOperations(csrmat, csrmat2); + std::cout << "Multiplication Requires " << fpo << " Floating point operations" << std::endl; + statsout << fpo << std::endl; + results << fpo << ";"; + statsout.flush(); + statsout.close(); + } + } + + switch (current_approach) + { + case 0: + { + cuSPARSE::CuSparseTest cuSparseTest; + + unsigned cuSubdiv_nnz = 0; + double cuSparse_duration = 0; + for (int i = 0; i < warmup; i++) + { + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + } + + for (int i = 0; i < iterations; i++) + { + auto duration = cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + cuSparse_duration += duration; + } + cuSparse_duration /= static_cast(iterations); + std::cout << std::setw(20) << "cuSparse -> NNZ: " << cuSubdiv_nnz << std::endl; + std::cout << std::setw(20) << "cuSparse SpGEMM: " << cuSparse_duration << " ms" << std::endl; + + results << cuSparse_duration << ";"; + stateout << 1 << std::endl; + break; + } + case 1: + { + dCSR d_csr_hiRes; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + warmupstats.measure_all = false; + output_stats.measure_all = false; + double hisparse_duration = 0; + double duration_blockstarts = 0.0; + double duration_spgemm = 0.0; + double duration_merge_case_computation = 0.0; + double duration_merge_simple = 0.0; + double duration_merge_max = 0.0; + double duration_merge_generalized = 
0.0; + double duration_write_csr = 0.0; + + // Warmup iterations for multiplication + for (int i = 0; i < warmup; ++i) + { + warmupstats.reset(); + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, warmupstats, false); + } + + // Multiplication + for (int i = 0; i < iterations; ++i) + { + stats.reset(); + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, stats, false); + output_stats += stats; + } + + output_stats.normalize(); + hisparse_duration = output_stats.duration; + duration_blockstarts = output_stats.duration_blockstarts; + duration_spgemm = output_stats.duration_spgemm; + duration_merge_case_computation = output_stats.duration_merge_case_computation; + duration_merge_simple = output_stats.duration_merge_simple; + duration_merge_max = output_stats.duration_merge_max; + duration_merge_generalized = output_stats.duration_merge_generalized; + duration_write_csr = output_stats.duration_write_csr; + + + std::cout << std::setw(20) << "ac-SpGEMM -> NNZ: " << d_csr_hiRes.nnz << std::endl; + std::cout << std::setw(20) << "ac-SpGEMM SpGEMM: " << hisparse_duration << " ms" << std::endl; + + output_stats.mem_clear_return = ACSpGEMM::clear_consistentMemory(); + + if (ACSpGEMM::Compare(d_csr_cuRes, d_csr_hiRes, false)) + { + results << hisparse_duration << ";"; + mem_consumption << output_stats.mem_clear_return + output_stats.mem_allocated_chunks << ";"; + writeDetailedInfo(output_stats, ours_detailed); + } + else + { + results << -2 << ";"; + mem_consumption << -2 << ";"; + ours_detailed << std::endl; + } + + stateout << 1 << std::endl; + break; + } + case 2: + { + // dCSR d_nsparse_result_mat; + // double nsparse_timing{ 0.0 }; + // NSparse::MemStats nsparse_stats; + // // Warmup iterations for multiplication + // for (int i = 0; i < warmup; ++i) + // { + // d_nsparse_result_mat.reset(); + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat/*, nsparse_stats*/); + // } + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_nsparse_result_mat.reset(); + // nsparse_timing += NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat/*, nsparse_stats*/); + // } + // nsparse_timing /= iterations; + + // std::cout << std::setw(20) << "nsparse -> NNZ: " << d_nsparse_result_mat.nnz << std::endl; + // std::cout << std::setw(20) << "nsparse SpGEMM: " << nsparse_timing << " ms" << std::endl; + + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_nsparse_result_mat, false)) + // { + // results << nsparse_timing << ";"; + // // mem_consumption << nsparse_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // stateout << 1 << std::endl; + printf("NSparse not included in public repository\n"); + break; + } + case 3: + { + // dCSR d_rmerge_result_mat; + // double rmerge_timing{ 0.0 }; + // uint32_t rmerge_nnz{ 0 }; + // bool bitstable{true}; + // HiSparse::Test::RMergeExecutionStats rmerge_stats; + // HostVector rmerge_offsets(csrmat.row_offsets.get(), csrmat.rows + 1); + // rmerge_offsets[csrmat.rows] = csrmat.nnz; + // HostVector rmerge_indices(csrmat.col_ids.get(), csrmat.nnz); + // HostVector rmerge_values(csrmat.data.get(), csrmat.nnz); + // SparseHostMatrixCSR host_A(csrmat.cols, csrmat.rows, rmerge_values, rmerge_indices, rmerge_offsets); + + // HostVector rmerge_offsets2(csrmat2.row_offsets.get(), csrmat2.rows + 1); + // rmerge_offsets2[csrmat2.rows] = csrmat2.nnz; + // HostVector rmerge_indices2(csrmat2.col_ids.get(), csrmat2.nnz); + // HostVector 
rmerge_values2(csrmat2.data.get(), csrmat2.nnz); + // SparseHostMatrixCSR host_B(csrmat2.cols, csrmat2.rows, rmerge_values2, rmerge_indices2, rmerge_offsets2); + + // SparseDeviceMatrixCSR A = ToDevice(host_A); + // SparseDeviceMatrixCSR B = ToDevice(host_B); + // SparseDeviceMatrixCSR C; + + // for (uint32_t i = 0; i < warmup; ++i) + // { + // RMerge::Multiply(A, B, C); + // } + + // // Multiplication + // for (uint32_t i = 0; i < iterations; ++i) + // { + // rmerge_timing += RMerge::Multiply(A, B, C/*, rmerge_stats*/); + // rmerge_nnz = C.NonZeroCount(); + // } + // rmerge_timing /= iterations; + + // dCSR d_rmerge_result_mat; + // d_rmerge_result_mat.nnz = rmerge_nnz; + // d_rmerge_result_mat.rows = csrmat.rows; + // d_rmerge_result_mat.cols = csrmat2.cols; + // d_rmerge_result_mat.row_offsets = C.RowStarts().Data(); + // d_rmerge_result_mat.col_ids = C.ColIndices().Data(); + // d_rmerge_result_mat.data = C.Values().Data(); + + // std::cout << std::setw(20) << "RMerge -> NNZ: " << rmerge_nnz << std::endl; + // std::cout << std::setw(20) << "RMerge SpGEMM: " << rmerge_timing << " ms" << std::endl; + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_rmerge_result_mat, false)) + // { + // results << rmerge_timing << ";"; + // // mem_consumption << rmerge_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // // Let the other object destroy the memory + // d_rmerge_result_mat.row_offsets = nullptr; + // d_rmerge_result_mat.col_ids = nullptr; + // d_rmerge_result_mat.data = nullptr; + + // stateout << 1 << std::endl; + printf("RMerge not included in public repository\n"); + break; + } + case 4: + { + // dCSR d_bhSparse_result_mat; + // double bhSparse_timing{ 0.0 }; + // HiSparse::Test::bhSparseExecutionStats bhsparse_stats; + // // Warmup iterations for multiplication + // for (int i = 0; i < warmup; ++i) + // { + // d_bhSparse_result_mat.reset(); + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat); + // } + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_bhSparse_result_mat.reset(); + // bhSparse_timing += bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat/*, bhsparse_stats*/); + // } + // bhSparse_timing /= iterations; + + // std::cout << std::setw(20) << "bhSparse -> NNZ: " << d_bhSparse_result_mat.nnz << std::endl; + // std::cout << std::setw(20) << "bhSparse SpGEMM: " << bhSparse_timing << " ms" << std::endl; + + // if (ACSpGEMM::Compare(d_csr_cuRes, d_bhSparse_result_mat, false)) + // { + // results << bhSparse_timing << ";"; + // // mem_consumption << bhsparse_stats.mem_peak << ";"; + // } + // else + // { + // results << -2 << ";"; + // mem_consumption << -2 << ";"; + // } + + // stateout << 1 << std::endl; + // printf("After stateout\n"); + printf("bhSparse not included in public repository\n"); + break; + } + default: + std::cout << "error: wrong test state" << std::endl; + break; + } + } + catch (const SpGEMMException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-4;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeSimpleCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-5;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << 
std::endl; + } + catch (const MergeMaxChunksCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-6;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeGeneralizedCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-7;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const MergeLoopingException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-8;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfMemoryException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-9;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfChunkPointerException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-10;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + catch (const std::exception& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-1;"; + + if (approach_selector & 14 & (1 << current_approach)) + mem_consumption << "-1;"; + + if (current_approach == 1) + ours_detailed << std::endl; + + stateout << 0 << std::endl; + } + results.flush(); + mem_consumption.flush(); + ours_detailed.flush(); + stateout.flush(); + } + results.flush(); + results.close(); + mem_consumption.flush(); + mem_consumption.close(); + ours_detailed.flush(); + ours_detailed.close(); + stateout.flush(); + stateout.close(); + + if (continue_run) + return 1; + } + std::cout << "Test done\n"; + return 0; +} + +// ################################################################# +// +int main(int argc, char *argv[]) +{ +#ifdef _WIN32 + //surpress crash notification windows (close or debug program window) + SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); +#endif + + std::string value_type = argc > 7 ? argv[7] : "f"; + if (value_type.compare("f") == 0) + return performSpGEMMTests(argc, argv); + else + return performSpGEMMTests(argc, argv); +} \ No newline at end of file diff --git a/include/GALATIC/minimal_example.cu b/include/GALATIC/minimal_example.cu new file mode 100644 index 00000000..c59b4de7 --- /dev/null +++ b/include/GALATIC/minimal_example.cu @@ -0,0 +1,134 @@ +/******************************************* +#include "GALATIC/include/CSR.cuh" +#include "GALATIC/include/dCSR.cuh" +#include "GALATIC/include/SemiRingInterface.h" +#include "GALATIC/source/device/Multiply.cuh" + +Your "includes" probably needs to look something like the above, rather than what's below. 
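+
+A hedged aside, not part of the original example: the Arith_SR struct below
+shows the plain arithmetic semiring. The same SemiRing interface can carry other
+algebras; a min-plus (tropical) sketch, assuming SemiRing takes the left input,
+right input and output value types as template parameters exactly as Arith_SR
+instantiates them with double, could look like:
+
+    struct MinPlus_SR : SemiRing<double, double, double>
+    {
+        // "multiply" combines an edge weight with a path length -> addition
+        __host__ __device__ double multiply(const double& a, const double& b) const { return a + b; }
+        // "add" keeps the shorter of two candidate paths -> minimum
+        __host__ __device__ double add(const double& a, const double& b) const { return a < b ? a : b; }
+        // identity of "add" is +infinity; DBL_MAX needs <cfloat>
+        __host__ __device__ static double AdditiveIdentity() { return DBL_MAX; }
+    };
+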
+*******************************************/ + +//#include "include/CSR.cuh" +//#include "include/dCSR.cuh" +#include "include/SemiRingInterface.h" +#include "include/TestSpGEMM.cuh" +#include + +//#include "source/device/Multiply.cuh" + +struct foo { + double a; +}; + +struct foo2 { + short h; + double a; + double b; + double c; + + double d; + short k; +}; + +struct Arith_SR : SemiRing +{ + __host__ __device__ double multiply(const double& a, const double& b) const { return a * b; } + __host__ __device__ double add(const double& a, const double& b) const { return a + b; } + __host__ __device__ static double AdditiveIdentity() { return 0; } +}; + + +int main(int argc, const char* argv[]) +{ + CSR input_A_CPU; + CSR input_B_CPU; + + COO input_A_COO; + COO input_B_COO; + + CSR result_mat_CPU; + + + + + + printf("%s + %s", argv[1], argv[2]); + input_A_COO = loadMTX(argv[1]); + input_B_COO = loadMTX(argv[2]); + + convert(input_A_CPU, input_A_COO); + convert(input_B_CPU, input_B_COO); + + // [ [ 1, 2], + // [ 3 4 ] ] + cudaDeviceSynchronize(); + + + // Transfer input matrices onto GPU + + + // load data into semiring struct. For this one, we don't need to do anything + Arith_SR semiring; + + + // Setup execution options, we'll skip the details for now. + + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 16; + const int MaxChunksGeneralizedMerge = 256; // MAX: 865 + const int MergePathOptions = 8; + + + GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, + InputElementsPerThreads, RetainElementsPerThreads, + MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions ); + + const bool Debug_Mode = false; + // DefaultTraits.preferLoadBalancing = true; + ExecutionStats stats; + // stats.measure_all = false; + typedef std::chrono::high_resolution_clock Time; + typedef std::chrono::milliseconds ms; + typedef std::chrono::duration fsec; + auto t0 = Time::now(); + + for (int i =0; i < 10000; i++){ + // Actually perform the matrix multiplicaiton + //if (i % 10 == 0) printf("%i\n",i); + dCSR input_A_GPU; + dCSR input_B_GPU; + convert(input_A_GPU, input_A_CPU); + convert(input_B_GPU, input_B_CPU); + cudaDeviceSynchronize(); + dCSR result_mat_GPU; + ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + cudaDeviceSynchronize(); + //std::cout << result_mat_GPU.nnz << std::endl; + convert(result_mat_CPU, result_mat_GPU); + cudaDeviceSynchronize(); + } + auto t1 = Time::now(); + fsec fs = t1 - t0; + ms d = std::chrono::duration_cast(fs); + dCSR result_mat_GPU; + dCSR input_A_GPU; + dCSR input_B_GPU; + convert(input_A_GPU, input_A_CPU); + convert(input_B_GPU, input_B_CPU); + ACSpGEMM::Multiply(input_A_GPU, input_B_GPU, result_mat_GPU, DefaultTraits, stats, Debug_Mode, semiring); + cudaDeviceSynchronize(); + printf("Took %d for 1000 tries, for an average of %d\n", d, (d / 1000)); + TestSpGEMM(input_A_GPU, input_B_GPU, semiring, [=] (const Arith_SR::output_t &a, const Arith_SR::output_t &b) { return std::abs(a-b) < 0.01; }, DefaultTraits); + + convert(result_mat_CPU, result_mat_GPU); + + cudaDeviceSynchronize(); + + for (int i =0; i < 4; i++) { + std::cout << "nnz: " << i << " val " << result_mat_CPU.data[i] << std::endl; + } + +} \ No newline at end of file diff --git a/include/GALATIC/source/checkBitStability.cuh b/include/GALATIC/source/checkBitStability.cuh new file mode 
100644 index 00000000..8c9c1b70 --- /dev/null +++ b/include/GALATIC/source/checkBitStability.cuh @@ -0,0 +1,874 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* performTestCase.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +//surpress crash notification windows (close or debug program window) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#else +#include +#endif + +// Local includes +#include "CSR.h" +#include "COO.h" +#include "Vector.h" +#include "dCSR.h" +#include "dVector.h" +#include "Multiply.h" +#include "Transpose.h" +#include "Compare.cuh" +#include "consistent_memory.h" +#include "CustomExceptions.h" + +#ifdef _WIN32 +#include +using namespace std::filesystem; +#else +#include +using namespace std::experimental::filesystem; +#endif + +// CuSparse include +#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #include "nsparse/include/nsparseMultiply.h" + +// // RMerge include +// #include "RMerge/include/rmergeMultiply.h" + +// // BhSparse include +// #include"bhSparse/include/bhSparseMultiply.h" + +unsigned int padding = 0; +template +std::string typeext(); +template<> +std::string typeext() +{ + return std::string(""); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +template +std::string nameextension() +{ + return ""; +} +template<> +std::string nameextension() +{ + return "_d"; +} +template<> +std::string nameextension() +{ + return "_f"; +} + +template +bool isFloat() +{ + return false; +} + +template<> +bool isFloat() +{ + return true; +} + +// ################################################################# +// +uint32_t numTrailingBinaryZeros(uint32_t n) +{ + uint32_t mask = 
1; + for (uint32_t i = 0; i < 32; i++, mask <<= 1) + if ((n & mask) != 0) + return i; + + return 32; +} + +// ################################################################# +// +void writeDetailedInfo(const ExecutionStats& stats, std::ofstream& out) +{ + out << stats.shared_rows << ";"; + out << stats.simple_rows << ";"; + out << stats.simple_mergers << ";"; + out << stats.complex_rows << ";"; + out << stats.generalized_rows << ";"; + out << stats.duration << ";"; + out << stats.duration_blockstarts << ";"; + out << stats.duration_spgemm << ";"; + out << stats.duration_merge_case_computation << ";"; + out << stats.duration_merge_simple << ";"; + out << stats.duration_merge_max << ";"; + out << stats.duration_merge_generalized << ";"; + out << stats.duration_write_csr << ";"; + out << stats.mem_clear_return << ";"; + out << stats.mem_allocated_chunks << ";"; + out << stats.mem_used_chunks << ";"; + out << stats.restarts << ";"; + out << std::endl; +} + +// ################################################################# +// +void getNextMatrix(const char* foldername, const std::string& lastname, std::string& nextname) +{ + bool found_last = false; + directory_iterator it{ foldername }; + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + continue; + if (it->path().extension() != ".mtx") + continue; + if (!found_last) + { + if (it->path().filename() != lastname) + continue; + else + { + found_last = true; + continue; + } + } + else + { + nextname = it->path().filename().string(); + return; + } + } + nextname = std::string(""); + return; +} + +// ################################################################# +// +std::string getColumnHeaders(uint32_t approaches, std::string prefix = "") +{ + std::string headers(prefix); + + if (approaches & (0x1 << 0)) + headers.append("cuSparse;"); + if (approaches & (0x1 << 1)) + headers.append("acSpGEMM;"); + // if (approaches & (0x1 << 2)) + // headers.append("nsparse;"); + // if (approaches & (0x1 << 3)) + // headers.append("RMerge;"); + // if (approaches & (0x1 << 4)) + // headers.append("bhSparse;"); + + headers.append("\n"); + + return headers; +} + +// ################################################################# +// +template +void writeMatrixStats(CSR& mat, const std::string matname, std::ofstream& outfs) +{ + typename CSR::Statistics stats = mat.rowStatistics(); + //"\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max; + outfs << matname << ";" << mat.rows << ";" << mat.cols << ";" << mat.nnz << ";" + << stats.mean << ";" << stats.std_dev << ";" << stats.min << ";" << stats.max << ";"; +} + +// ################################################################# +// +template +size_t countFloatingPointOperations(CSR& matA, CSR& matB) +{ + size_t count = 0; + for (auto nnzAiter = 0; nnzAiter < matA.nnz; ++nnzAiter) + count += matB.row_offsets[matA.col_ids[nnzAiter] + 1] - matB.row_offsets[matA.col_ids[nnzAiter]]; + return count; +} + +// ################################################################# +// +std::ostream& writeGPUInfo(std::ostream& file) +{ + int cudaDevice; + cudaGetDevice(&cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + + file << "name;cc;num_multiprocessors;warp_size;max_threads_per_mp;regs_per_mp;shared_memory_per_mp;total_constant_memory;total_global_memory;clock_rate;max_threads_per_block;max_regs_per_block;max_shared_memory_per_block\n" + << prop.name << ';' + << prop.major << '.' + << prop.minor << ';' + << prop.multiProcessorCount << ';' + << prop.warpSize<< ';' + << prop.maxThreadsPerMultiProcessor << ';' + << prop.regsPerMultiprocessor << ';' + << prop.sharedMemPerMultiprocessor << ';' + << prop.totalConstMem << ';' + << prop.totalGlobalMem << ';' + << prop.clockRate * 1000 << ';' + << prop.maxThreadsPerBlock << ';' + << prop.regsPerBlock << ';' + << prop.sharedMemPerBlock + << std::endl; + return file; +} + +// ################################################################# +// +template +int performSpGEMMTests(int argc, char ** argv) +{ + std::string name_extension = ""; + + bool runtests = true; + if (argc > 2) + runtests = std::string(argv[2]) != "0"; + + int cudaDevice = 0; + if (argc > 3) + cudaDevice = std::atoi(argv[3]); + + bool continue_run = false; + if (argc > 4) + continue_run = std::string(argv[4]) != "0"; + + std::vector trait_init = { 256, 3, 2, 4, 4, 16, 256, 8 }; + if (argc > 5) + { + + std::istringstream traitstream(argv[5]); + std::vector input_trait_init; + std::string val; + while (std::getline(traitstream, val, ',')) + input_trait_init.push_back(std::stoi(val)); + + if (input_trait_init.size() != trait_init.size()) + printf("Malformed trait init input param; %zu params required; fallback to default\n", trait_init.size()); + else + trait_init = input_trait_init; + } + + uint32_t approach_selector = 0xFFFFFFFF; + uint32_t first_approach = 0; + if (argc > 6) + { + approach_selector = std::stoi(argv[6]); + first_approach = numTrailingBinaryZeros(approach_selector); + if (approach_selector == 0) + { + printf("ERROR: No approaches selected for testing\n"); + return 0; + } + } + + cudaSetDevice(cudaDevice); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, cudaDevice); + std::cout << "Going to use " << prop.name << " " << prop.major << "." 
<< prop.minor << "\n"; + std::string gpuname = prop.name; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(trait_init[0], trait_init[1], trait_init[2], trait_init[3], trait_init[4], trait_init[5], trait_init[6], trait_init[7]); + DefaultTraits.preferLoadBalancing = true; + + std::ofstream results; + std::ofstream stateout; + std::ofstream statsout; //This will go horribly wrong: stateout vs statsout + std::string trait_string = + std::to_string(trait_init[0]) + + "_" + std::to_string(trait_init[1]) + + "_" + std::to_string(trait_init[2]) + + "_" + std::to_string(trait_init[3]) + + "_" + std::to_string(trait_init[4]) + + "_" + std::to_string(trait_init[5]) + + "_" + std::to_string(trait_init[6]) + + "_" + std::to_string(trait_init[7]) + "_"; + std::string statefile = std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".state"; + std::string lastname; + std::string current_name; + unsigned num_approaches = 6; + unsigned current_approach = first_approach; + bool finished_write = true; + bool fresh_file = !continue_run; + if (continue_run) + { + std::ifstream last(statefile.c_str()); + if (last) + { + std::getline(last, lastname); + current_name = lastname; + std::cout << "Continuing run after " << lastname << std::endl; + results.open((std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str(), std::ios_base::app); + statsout.open("matrix_stats.csv", std::ios_base::app); + std::cout << "After open" << std::endl; + + + std::time_t now = std::time(NULL); + std::tm * ptm = std::localtime(&now); + char buffer[32]; + // Format: Mo, 15.06.2009 20:20:00 + std::strftime(buffer, 32, "%a, %d.%m.%Y %H:%M:%S", ptm); + std::cout << buffer << std::endl; + + std::string lastapproach; + std::getline(last, lastapproach); + current_approach = (std::stoi(lastapproach) + 1) % num_approaches; + std::string finished_write_string; + std::getline(last, finished_write_string); + finished_write = !finished_write_string.empty(); + + if (!finished_write) + { + results << -3 << ";"; + finished_write = true; + } + + last.close(); + + if (!(approach_selector & (0x1 << current_approach))) + { + //this limits us to 31 approaches :-p + uint32_t next_offset = numTrailingBinaryZeros((approach_selector & 0xEFFFFFFF) >> current_approach); + if (next_offset < sizeof(uint32_t) * 8) + { + current_approach += next_offset; + } + else + { + current_approach = first_approach; + + results << std::endl; + + const char *foldername = argc == 1 ? "." : argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + } + } + else if (current_approach < std::stoi(lastapproach)) + { + const char *foldername = argc == 1 ? "." 
: argv[1]; + getNextMatrix(foldername, lastname, current_name); + + if (current_name.empty()) + { + return 0; + } + + results << std::endl; + + if (current_name.empty()) + { + return 0; + } + } + } + else + { + fresh_file = true; + } + last.close(); + stateout.open(statefile.c_str()); + } + + if (fresh_file) + { + + results.open((std::string("bit_") + trait_string + nameextension() + name_extension + gpuname + ".csv").c_str()); + results << "\"sep=;\"\n"; + writeGPUInfo(results); + results << getColumnHeaders(approach_selector, "\nMatrix;rows;cols;nnz;r_mean;r_std_dev;r_min;r_max;Products;"); + + statsout.open("matrix_stats.csv", std::ios_base::app); + statsout << "\"sep=;\"\n"; + statsout << "\nMatrix; rows; cols; nnz; r_mean; r_std_dev; r_min; r_max;" << std::endl; + } + + + CSR csrmat, csrmat2, result_mat; + + char *foldername; + if (argc == 1) + { + foldername = const_cast("."); + } + else + foldername = argv[1]; + + bool found = fresh_file; + directory_iterator it{ foldername }; + + for (; it != directory_iterator{}; ++it) + { + if (!is_regular_file(*it)) + { + continue; + } + if (it->path().extension() != ".mtx") + { + continue; + } + if (!found && continue_run) + { + if (current_name.compare(it->path().filename().string()) != 0) + { + // std::cout << "Filename not current name\n"; + // std::cout << it->path().filename() << it->path().filename().string().length() << std::endl; + // std::cout << current_name << current_name.length() << std::endl; + continue; + } + else + found = true; + } + + std::string testname = it->path().filename().stem().string(); + std::cout << "\n\nrunning " << testname << std::endl; + std::string mantname = it->path().string(); + std::string csr_name = mantname + typeext() + ".hicsr"; + + if (approach_selector & (0x1 << current_approach)) + { + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csrmat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << mantname << "\"\n"; + COO coo_mat = loadMTX(mantname.c_str()); + convert(csrmat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + std::cout << "Skipping matrix \"" << mantname.c_str() << "\"\n"; + continue; + } + try + { + std::cout << "write csr file for future use\n"; + storeCSR(csrmat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + } + + if (current_approach == first_approach) + { + auto rowStats = csrmat.rowStatistics(); + + results << testname << ";"; + results << csrmat.rows << ";" << csrmat.cols << ";" << csrmat.nnz << ";" + << rowStats.mean << ";" << rowStats.std_dev << ";" << rowStats.min << ";" << rowStats.max << ";"; + } + + if (continue_run) + stateout << it->path().filename().string() << std::endl << current_approach << std::endl; + + if (runtests) + { + std::cout << "Matrix: " << csrmat.rows << "x" << csrmat.cols << ": " << csrmat.nnz << " nonzeros\n"; + + int32_t iterations = 20; + + try + { + dCSR gpu_csrmat, gpu_csrmat2, d_csr_cuRes; + convert(gpu_csrmat, csrmat, 0); + cuSPARSE::CuSparseTest cusparse; + + //calculate the transpose if matrix is not square + if (gpu_csrmat.rows != gpu_csrmat.cols) + { + cusparse.Transpose(gpu_csrmat, gpu_csrmat2); + convert(csrmat2, gpu_csrmat2); + } + else + { + 
convert(gpu_csrmat2, csrmat, 0); + convert(csrmat2, csrmat, 0); + } + + //generate reference solution using cuSparse + unsigned cuSubdiv_nnz = 0; + if (current_approach != 0 || current_approach == first_approach) + { + cusparse.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + + if (current_approach == first_approach) + { + //write out stats of result matrix + CSR h_csr_cuRes; + convert(h_csr_cuRes, d_csr_cuRes); + writeMatrixStats(h_csr_cuRes, testname, statsout); + size_t fpo = countFloatingPointOperations(csrmat, csrmat2); + std::cout << "Multiplication Requires " << fpo << " Floating point operations" << std::endl; + statsout << fpo << std::endl; + results << fpo << ";"; + statsout.flush(); + statsout.close(); + } + } + + switch (current_approach) + { + case 0: + { + dCSR d_csr_cuRes_comp; + cuSPARSE::CuSparseTest cuSparseTest; + bool bitstable{true}; + + for (int i = 0; i < iterations; i++) + { + if(i == 0) + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes, cuSubdiv_nnz); + else + { + cuSparseTest.Multiply(gpu_csrmat, gpu_csrmat2, d_csr_cuRes_comp, cuSubdiv_nnz); + if (!(ACSpGEMM::Compare(d_csr_cuRes, d_csr_cuRes_comp, true))) + { + printf("cuSparse: ## NOT ## Bit-Identical\n"); + results << -999 << ";"; + bitstable = false; + break; + } + } + } + if(bitstable) + { + printf("cuSparse: Bit-Identical\n"); + results << 0 << ";"; + } + stateout << 1 << std::endl; + break; + } + case 1: + { + dCSR d_csr_hiRes, d_csr_hiRes_comp; + ExecutionStats stats; + stats.measure_all = false; + bool bitstable{true}; + + // Multiplication + for (int i = 0; i < iterations; ++i) + { + stats.reset(); + if(i == 0) + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes, DefaultTraits, stats, false); + else + { + ACSpGEMM::Multiply(gpu_csrmat, gpu_csrmat2, d_csr_hiRes_comp, DefaultTraits, stats, false); + if (!(ACSpGEMM::Compare(d_csr_hiRes, d_csr_hiRes_comp, true))) + { + printf("AcSpGEMM: ## NOT ## Bit-Identical\n"); + results << -999 << ";"; + bitstable = false; + break; + } + } + } + + if(bitstable) + { + printf("AcSpGEMM: Bit-Identical\n"); + results << 0 << ";"; + } + stateout << 1 << std::endl; + break; + } + case 2: + { + // dCSR d_nsparse_result_mat, d_nsparse_result_mat_comp; + // bool bitstable{true}; + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_nsparse_result_mat_comp.reset(); + // if(i == 0) + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat); + // else + // { + // NSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_nsparse_result_mat_comp); + // if (!(ACSpGEMM::Compare(d_nsparse_result_mat, d_nsparse_result_mat_comp, true))) + // { + // printf("Nsparse: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + // } + + // if(bitstable) + // { + // printf("Nsparse: Bit-Identical\n"); + // results << 0 << ";"; + // } + + // stateout << 1 << std::endl; + printf("Nsparse not included in public repository\n"); + break; + } + case 3: + { + // bool bitstable{true}; + // uint32_t rmerge_nnz{ 0 }; + // HiSparse::Test::RMergeExecutionStats rmerge_stats; + // HostVector rmerge_offsets(csrmat.row_offsets.get(), csrmat.rows + 1); + // rmerge_offsets[csrmat.rows] = csrmat.nnz; + // HostVector rmerge_indices(csrmat.col_ids.get(), csrmat.nnz); + // HostVector rmerge_values(csrmat.data.get(), csrmat.nnz); + // SparseHostMatrixCSR host_A(csrmat.cols, csrmat.rows, rmerge_values, rmerge_indices, rmerge_offsets); + + // HostVector rmerge_offsets2(csrmat2.row_offsets.get(), csrmat2.rows + 1); + 
// rmerge_offsets2[csrmat2.rows] = csrmat2.nnz; + // HostVector rmerge_indices2(csrmat2.col_ids.get(), csrmat2.nnz); + // HostVector rmerge_values2(csrmat2.data.get(), csrmat2.nnz); + // SparseHostMatrixCSR host_B(csrmat2.cols, csrmat2.rows, rmerge_values2, rmerge_indices2, rmerge_offsets2); + + // SparseDeviceMatrixCSR A = ToDevice(host_A); + // SparseDeviceMatrixCSR B = ToDevice(host_B); + // SparseDeviceMatrixCSR C, C_comp; + + + // RMerge::Multiply(A, B, C); + // dCSR d_rmerge_result_mat, d_rmerge_result_mat_comp; + // d_rmerge_result_mat.nnz = rmerge_nnz; + // d_rmerge_result_mat.rows = csrmat.rows; + // d_rmerge_result_mat.cols = csrmat2.cols; + // d_rmerge_result_mat.row_offsets = C.RowStarts().Data(); + // d_rmerge_result_mat.col_ids = C.ColIndices().Data(); + // d_rmerge_result_mat.data = C.Values().Data(); + + // // Multiplication + // for (uint32_t i = 0; i < iterations; ++i) + // { + // RMerge::Multiply(A, B, C_comp); + // rmerge_nnz = C_comp.NonZeroCount(); + // d_rmerge_result_mat_comp.nnz = rmerge_nnz; + // d_rmerge_result_mat_comp.rows = csrmat.rows; + // d_rmerge_result_mat_comp.cols = csrmat2.cols; + // d_rmerge_result_mat_comp.row_offsets = C_comp.RowStarts().Data(); + // d_rmerge_result_mat_comp.col_ids = C_comp.ColIndices().Data(); + // d_rmerge_result_mat_comp.data = C_comp.Values().Data(); + // if (!(ACSpGEMM::Compare(d_rmerge_result_mat, d_rmerge_result_mat, true))) + // { + // printf("RMerge: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + + // // Let the other object destroy the memory + // d_rmerge_result_mat.row_offsets = nullptr; + // d_rmerge_result_mat.col_ids = nullptr; + // d_rmerge_result_mat.data = nullptr; + + // if(bitstable) + // { + // printf("RMerge: Bit-Identical\n"); + // results << 0 << ";"; + // } + // stateout << 1 << std::endl; + printf("RMerge not included in public repository\n"); + break; + } + case 4: + { + // dCSR d_bhSparse_result_mat, d_bhSparse_result_mat_comp; + // bool bitstable{true}; + // HiSparse::Test::bhSparseExecutionStats bhsparse_stats; + + // // Multiplication + // for (int i = 0; i < iterations; ++i) + // { + // d_bhSparse_result_mat_comp.reset(); + // if(i == 0) + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat); + // else + // { + // bhSparse::Multiply(gpu_csrmat, gpu_csrmat2, d_bhSparse_result_mat_comp); + // if (!(ACSpGEMM::Compare(d_bhSparse_result_mat, d_bhSparse_result_mat_comp, true))) + // { + // printf("BhSparse: ## NOT ## Bit-Identical\n"); + // results << -999 << ";"; + // bitstable = false; + // break; + // } + // } + // } + + // if(bitstable) + // { + // printf("BhSparse: Bit-Identical\n"); + // results << 0 << ";"; + // } + // stateout << 1 << std::endl; + printf("bhSparse not included in public repository\n"); + break; + } + default: + std::cout << "error: wrong test state" << std::endl; + break; + } + } + catch (const SpGEMMException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-4;"; + + stateout << 0 << std::endl; + } + catch (const MergeSimpleCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-5;"; + + stateout << 0 << std::endl; + } + catch (const MergeMaxChunksCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-6;"; + + stateout << 0 << std::endl; + } + catch (const MergeGeneralizedCaseException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-7;"; + + stateout << 0 << std::endl; + } + catch (const 
MergeLoopingException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-8;"; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfMemoryException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-9;"; + + stateout << 0 << std::endl; + } + catch (const RestartOutOfChunkPointerException& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-10;"; + + stateout << 0 << std::endl; + } + catch (const std::exception& e) { + std::cout << "Error:\n" << e.what() << "\n"; + + results << "-1;"; + + stateout << 0 << std::endl; + } + results.flush(); + stateout.flush(); + } + results.flush(); + results.close(); + stateout.flush(); + stateout.close(); + + if (continue_run) + return 1; + } + std::cout << "Test done\n"; + return 0; +} + +// ################################################################# +// +int main(int argc, char *argv[]) +{ +#ifdef _WIN32 + //surpress crash notification windows (close or debug program window) + SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX); +#endif + + std::string value_type = argc > 7 ? argv[7] : "f"; + if (value_type.compare("f") == 0) + return performSpGEMMTests(argc, argv); + else + return performSpGEMMTests(argc, argv); +} \ No newline at end of file diff --git a/include/GALATIC/source/device/Compare.cuh b/include/GALATIC/source/device/Compare.cuh new file mode 100644 index 00000000..539e8b37 --- /dev/null +++ b/include/GALATIC/source/device/Compare.cuh @@ -0,0 +1,98 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
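+//
+// A hedged usage sketch (not the library's host-side wrapper from Compare.h) for
+// the row-wise verification kernel d_compare defined further below: one thread
+// per row, a single device flag collects any mismatch. The block size and the
+// DataType = double instantiation are assumptions made for illustration only.
+//
+//   uint32_t* d_flag;
+//   cudaMalloc(&d_flag, sizeof(uint32_t));
+//   cudaMemset(d_flag, 0, sizeof(uint32_t));
+//   const int block = 256;
+//   const int grid = (rows + block - 1) / block;
+//   d_compare<double><<<grid, block>>>(rows, cols,
+//       ref.row_offsets, ref.col_ids, ref.data,      // reference CSR (device pointers)
+//       cmp.row_offsets, cmp.col_ids, cmp.data,      // candidate CSR (device pointers)
+//       /*compare_data*/ true, /*epsilon*/ 0.0, d_flag);
+//   uint32_t mismatch = 0;
+//   cudaMemcpy(&mismatch, d_flag, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+//   // mismatch != 0 -> structure or values differ between reference and candidate.
+//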
+// + +/*!/------------------------------------------------------------------------------ +* Compare.cu +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include + +// Local includes +#include "Compare.h" +#include "common.h" + +//#define VERIFICATION_TEXT + +template +__global__ void d_compare(int in_rows, int in_cols, const uint32_t* __restrict reference_offset, const uint32_t* __restrict reference_indices, const DataType* __restrict reference_values, + const uint32_t* __restrict compare_offset, const uint32_t* __restrict compare_indices, const DataType* __restrict compare_values, bool compare_data, double epsilon, uint32_t* verification) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= in_rows) + return; + + uint32_t ref_offset = reference_offset[tid]; + uint32_t comp_offset = compare_offset[tid]; + uint32_t ref_number_entries = reference_offset[tid + 1] - ref_offset; + uint32_t comp_number_entries = compare_offset[tid + 1] - comp_offset; + + if (ref_number_entries != comp_number_entries) + { +#ifdef VERIFICATION_TEXT + printf("---------- Row: %u | Row length not identical: (Ref|Comp) : (%u|%u)\n",tid, ref_number_entries, comp_number_entries); +#endif + *verification = 1; + } + + uint32_t num_entries = min(ref_number_entries, comp_number_entries); + + for (uint32_t i = 0; i < num_entries; ++i) + { + if (reference_indices[ref_offset + i] != compare_indices[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Row indices do NOT match: (Ref|Comp) : (%u|%u) - pos: %u/%u\n", tid, reference_indices[ref_offset + i], compare_indices[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + if (compare_data) + { + if (reference_values[ref_offset + i] != compare_values[comp_offset + i]) + { +#ifdef VERIFICATION_TEXT + printf("Row: %u | Values do NOT match: (Ref|Comp) : (%f|%f) - pos: %u/%u\n", tid, reference_values[ref_offset + i], compare_values[comp_offset + i], i, num_entries); +#endif + *verification = 1; + } + } + } + + return; +} +}; + diff --git a/include/GALATIC/source/device/Multiply.cuh b/include/GALATIC/source/device/Multiply.cuh new file mode 100644 index 00000000..2d295151 --- /dev/null +++ b/include/GALATIC/source/device/Multiply.cuh @@ -0,0 +1,938 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +/*!/------------------------------------------------------------------------------ +* Multiply.cpp +* +* ac-SpGEMM +* +* Authors: Daniel Mlakar, Markus Steinberger, Martin Winter +*------------------------------------------------------------------------------ +*/ +#pragma once + +#include "memory.cuh" +// Global includes +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#define LZCNT __lzcnt +#else +//#include +#define LZCNT __builtin_clzll +#endif + +// Local includes +#include "../../include/Multiply.h" +#include "../../include/device/MultiplyKernels.h" +#include "../../include/device/consistent_gpu_memory.h" +#include "../../include/devicetools/stream.h" +#include "../../include/meta_utils.h" +#include "../../include/device/acSpGEMM_DetermineBlockStarts.cuh" +#include "../../include/device/acSpGEMM_SpGEMM.cuh" +#include "../../include/device/acSpGEMM_MergeSimple.cuh" +#include "../../include/device/acSpGEMM_MergeMaxChunks.cuh" +#include "../../include/device/acSpGEMM_MergeGeneralized.cuh" +#include "../../include/device/acSpGEMM_ChunksToCSR.cuh" +#include "../../include/device/HelperFunctions.cuh" +#include "../../include/CustomExceptions.h" + + +#pragma once + +#include "../../include/dCSR.cuh" +#include "../../include/execution_stats.h" +#include "../../include/default_scheduling_traits.h" + +void startTimer(cudaEvent_t& start, CUstream stream = 0) +{ + HANDLE_ERROR(cudaEventRecord(start, stream)); +} + +float recordTimer(cudaEvent_t& start, cudaEvent_t& end, CUstream stream = 0) +{ + float time; + HANDLE_ERROR(cudaEventRecord(end, stream)); + HANDLE_ERROR(cudaEventSynchronize(end)); + HANDLE_ERROR(cudaEventElapsedTime(&time, start, end)); + return time; + return 0; +} + +using IndexType = uint32_t; +using OffsetType = uint32_t; + + +namespace ACSpGEMM { + + template + __host__ __forceinline__ T divup(T a, T b) + { + return (a + b - 1) / b; + } + + template + __host__ __forceinline__ T alignment(T size, size_t alignment) + { + return divup(size, alignment) * alignment; + } + + int id; + + template + void MultiplyImplementation(const dCSR& matA, const dCSR& matB, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& traits, ExecutionStats& stats, SEMIRING_t semiring) + { + HANDLE_ERROR(cudaGetLastError()); + + using ConsistentGPUMemory = ConsistentMemory; + + // the magic numbers to make it run smoother + const float OverallocationFactor = 1.1f; + const int ChunkPointerOverestimationFactor = 4; + const float ChunkOverallocationFactor = 1.0f; + using UintBitSet = std::bitset; + + if(DEBUG_MODE) + { + std::cout << "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"; + std::cout << "THREADS: " << threads << " | NNZPerThread: " << nnz_per_thread << " | InputElementsPerThreads: " << input_elements_per_thread << " | RetainElementsPerThreads: " << retain_elements_per_thread; + std::cout << " | MaxChunks: " << merge_max_chunks << " | MergePathOptions: " << merge_max_path_options << "| 
ChunkpointerOverestimationFactor: " << ChunkPointerOverestimationFactor << "\n"; + std::cout << "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n"; + } + + // Helper variables + size_t memory_usage_in_Bytes{ 0 }; + const size_t chunckAllocationsSize{ 256 }; + const size_t numFlags{ 128 }; + const size_t numCounters{ 3 }; + const size_t mergeTypeCounters{ 4 }; + static size_t maxExpectedNNZ{ 500000000 }; //limit allocation... + static size_t minExpectedNNZ{ 10000000 }; //limit allocation... + // 10000000 + static float lastChunckBufferRequirementRatio{ 1.0f }; + const uint32_t nnzperblock{ threads * nnz_per_thread }; + size_t run{ 0 }, chunk_pointer_restart_run{ 0 }; + bool completed{ false }; + bool rowmerging{ false }; + MergeCaseOffsets mergeBlocks; + uint32_t* currentCounters, *currentChunckAllocation, *currentFlag; + uint32_t numSharedRows; + size_t size_to_allocate; + size_t upper_limit{ 3LL * 1024 * 1024 * 1024 }; + + // Kernels + AcSpGEMMKernels spgemm(threads); + + // Matrix information + size_t Arows = matA.rows; + size_t Acols = matA.cols; + size_t Brows = matB.rows; + size_t Bcols = matB.cols; + size_t Crows = Arows; + size_t Ccols = Bcols; + + if (Acols != Brows) + throw std::runtime_error("Unable to multiply matrix with matrix - invalid dimensions"); + + // Matrix Output estimation + double a_avg_row = matA.nnz / static_cast(Arows); + double b_avg_row = matB.nnz / static_cast(Brows); + double avg_row_overlap = b_avg_row / Bcols; + // note geometric sequence + double output_estimate = OverallocationFactor*Arows*b_avg_row * (1.0 - pow(1.0 - avg_row_overlap, a_avg_row)) / (avg_row_overlap); + + // chunks might get created earlier + double single_chunk_estimate = b_avg_row; + double current_overlap = avg_row_overlap; + double merges; + for (merges = 1; merges < static_cast(a_avg_row + 1.0); ++merges) + { + if (single_chunk_estimate >= retain_elements_per_thread*threads) + break; + single_chunk_estimate += (1 - current_overlap)*b_avg_row; + current_overlap = current_overlap + (1 - current_overlap)*avg_row_overlap; + } + HANDLE_ERROR(cudaGetLastError()); + + double intermediate_estimate = OverallocationFactor * a_avg_row / std::min(merges, a_avg_row) * single_chunk_estimate * Arows; + double mergepointer_estimate = std::max(intermediate_estimate, output_estimate) / (retain_elements_per_thread*threads) + 16 * 1024; + size_t expectedNNZ = std::max(minExpectedNNZ, std::min(maxExpectedNNZ, static_cast(lastChunckBufferRequirementRatio*std::max(intermediate_estimate, output_estimate)))); + size_to_allocate = (std::max(sizeof(typename SEMIRING_t::rightInput_t), sizeof(typename SEMIRING_t::output_t))+ sizeof(IndexType))*expectedNNZ*ChunkOverallocationFactor; + size_t free, total; + cudaMemGetInfo(&free, &total); + upper_limit = std::min(upper_limit, free / 3); + if (size_to_allocate > upper_limit) + size_to_allocate = upper_limit; + if(DEBUG_MODE) + { + std::cout << "A: " << Arows << "x" << Acols << " NNZ: " << matA.nnz << " avg row: " << a_avg_row << " " << "B: " << Brows << "x" << Bcols << " NNZ: " << matB.nnz << " avg row: " << b_avg_row << "\n"; + std::cout << "expected row overlap: " << avg_row_overlap << " overallocation: " << OverallocationFactor << "\n"; + std::cout << "expected nnz: " << static_cast(round(output_estimate)) << " expected temp: " << static_cast(round(intermediate_estimate)) << " mem alloc: " << expectedNNZ << "\n"; + std::cout << "mergepointer alloc " << 
static_cast(ChunkPointerOverestimationFactor*mergepointer_estimate) << " mergepointer estimate: " << mergepointer_estimate << "\n"; + } + + HANDLE_ERROR(cudaGetLastError()); + + // CUDA variables + CUstream stream = 0; + int blockSize = 256; + int gridSize(divup(Arows + 1, blockSize)); + const int number_merge_streams = 3; + static CUstream mergeStreams[number_merge_streams]; + for (int i = 0; i < number_merge_streams; ++i) + { + if(stats.measure_all) + mergeStreams[i] = stream; + else + cudaStreamCreate(&mergeStreams[i]); + } + HANDLE_ERROR(cudaGetLastError()); + + cudaEvent_t ce_start, ce_stop, individual_start, individual_stop; + cudaEventCreate(&ce_start); cudaEventCreate(&ce_stop); cudaEventCreate(&individual_start); cudaEventCreate(&individual_stop); + HANDLE_ERROR(cudaGetLastError()); + // GPU Memory Helper structures - general + static ConsistentGPUMemory chunckPointers; + static ConsistentGPUMemory combinedGeneralMemory; + static ConsistentGPUMemory chunk_counter_cptr; + uint32_t* chunckAllocations{ nullptr }; + uint32_t* blockStarts{ nullptr }; + uint32_t* sharedRowTracker{ nullptr }; + void** outputRowListHead{ nullptr }; + uint32_t* outputRowChunkCounter{ nullptr }; + uint32_t* completion_status{ nullptr }; + uint32_t* chunk_counter{ nullptr }; + void* prefixSumTemp{ nullptr }; + + // GPU Memory Helper structures - merge stage allocation + static ConsistentGPUMemory combineBlockOffsets; // SIZE: combineBlockOffsetsSize * sizeof(IndexType) + + static ConsistentGPUMemory chunk_indices_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + static ConsistentGPUMemory chunk_values_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + //FIXME: RL - This is no longer *8, but sizeof(Either). Probably *16 because alignment. this shoudln't matter? + //FIXME: till confirmed/tested irrelevant + + static ConsistentGPUMemory chunk_multiplier_cptr; // SIZE: ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * 8 + + static ConsistentGPUMemory combinedMergeStageMemory; + static uint32_t* shared_rows_handled{ nullptr }; + static uint32_t* restart_completion{ nullptr }; + static uint32_t* chunkElementConsumedAndPath{ nullptr }; + uint32_t* num_chunks{ nullptr }; + uint32_t* chunkElementCountDataOffset{ nullptr }; + uint32_t* sample_offset{ nullptr }; + static IndexType** chunk_indices{ nullptr }; + static Either* chunk_values{ nullptr }; + static typename SEMIRING_t::leftInput_t* chunk_multiplier{ nullptr }; + HANDLE_ERROR(cudaDeviceSynchronize()); + + + // CPU Memory Helper structures + static RegisteredMemoryVar chunkPointerSize(0); + static RegisteredMemoryVar outputRowInfoSize(0); + static RegisteredMemoryVar prefixSumTempMemSize; + static RegisteredMemoryVar combineBlockOffsetsSize(0); + static RegisteredMemoryVar mergeBlocksAlloc(0); + static RegisteredMemoryVar lastSharedRows(0); + static RegisteredMemoryVar merge_simple_rows(0); + static RegisteredMemoryVar merge_max_chunks_rows(0); + static RegisteredMemoryVar merge_generalized_rows(0); + uint32_t flagsAndListAllocCounters[numFlags + numCounters]; + size_t tempChunkBufferSizes[256]; + CU::unique_ptr tempChunkBuffers[256]; + tempChunkBufferSizes[0] = alignment(size_to_allocate, 16); + // + // TSOPF_RS_b300_c2.mtx shows very weird results if this is done here?? 
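+	//
+	// Note on the sizing helpers used just below (illustrative numbers only):
+	// divup(a, b) is the integer ceiling (a + b - 1) / b, and alignment(s, a)
+	// rounds s up to the next multiple of a via divup(s, a) * a, e.g.
+	// alignment(1000, 16) = divup(1000, 16) * 16 = 63 * 16 = 1008. The request
+	// size_to_allocate itself was derived further above from the geometric-series
+	// estimate of the output/intermediate nnz, so the first temporary chunk buffer
+	// is that estimate padded up to a 16-byte multiple.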
+ // + // Allocate temporary memory for chunks + tempChunkBuffers[0] = CU::allocMemory(tempChunkBufferSizes[0]); + + HANDLE_ERROR(cudaDeviceSynchronize()); + + // ############################## + startTimer(ce_start, stream); + // ############################## + if(stats.measure_all) + startTimer(individual_start, stream); + + + // Allocate memory for block offsets + uint32_t requiredBlocks = divup(matA.nnz, nnzperblock); + HANDLE_ERROR(cudaDeviceSynchronize()); + + // Allocate memory for chunk and shared row tracker + if (outputRowInfoSize < Crows) + { + //---------------------------------------------------------- + prefixSumTempMemSize = spgemm.tempMemSize(Crows); + //---------------------------------------------------------- + outputRowInfoSize = Crows; + } + + HANDLE_ERROR(cudaGetLastError()); + + + // Allocate combined general memory + size_t combinedGeneralMemory_size = + /*chunckAllocations*/alignment((chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters) * sizeof(uint32_t), 8) + + /*blockStarts*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + /*completion_status*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + ///*chunk_counter*/ alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) + + /*outputRowListHead*/ alignment(Crows * sizeof(void*), 8) + + /*outputRowChunkCounter*/ alignment(Crows * sizeof(uint32_t), 8) + + /*sharedRowTracker*/ alignment(Crows * sizeof(uint32_t), 8) + + /*prefixSumTemp*/ alignment(static_cast(prefixSumTempMemSize), 8); + combinedGeneralMemory.assure(combinedGeneralMemory_size); + memory_usage_in_Bytes += combinedGeneralMemory_size; + + // Place pointers in correct positions + outputRowListHead = combinedGeneralMemory.get(); + chunckAllocations = reinterpret_cast(outputRowListHead + (alignment(Crows * sizeof(void*), 8) / sizeof(void*))); + completion_status = chunckAllocations + alignment((chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters) * sizeof(uint32_t), 8) / sizeof(uint32_t); + /*chunk_counter = completion_status + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t));*/ + blockStarts = completion_status + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t)); + outputRowChunkCounter = blockStarts + (alignment((requiredBlocks + 2) * sizeof(uint32_t), 8) / sizeof(uint32_t)); + sharedRowTracker = outputRowChunkCounter + (alignment(Crows * sizeof(uint32_t), 8) / sizeof(uint32_t)); + prefixSumTemp = reinterpret_cast(sharedRowTracker + (alignment(Crows * sizeof(uint32_t), 8) / sizeof(uint32_t))); + HANDLE_ERROR(cudaGetLastError()); + + // TODO: Move back in, currently sometimes produces crashes for whatever reason + chunk_counter_cptr.assure((requiredBlocks + 2) * sizeof(uint32_t)); + chunk_counter = chunk_counter_cptr.get(); + //std::cout << "MADE IT IN " << std::endl; + // Allocate memory for chunk pointers + size_t targetChunkPointerSize =ChunkPointerOverestimationFactor*mergepointer_estimate; //fixme : rl + if (chunkPointerSize < targetChunkPointerSize) + { + chunkPointerSize = targetChunkPointerSize; + chunckPointers.assure((targetChunkPointerSize) * sizeof(void*)); + memory_usage_in_Bytes += (targetChunkPointerSize) * sizeof(void*); + } + + // Allocate memory for offsets + CU::unique_ptr newmat_offsets; + if (matOut.rows != Crows) + { + newmat_offsets = CU::allocMemory((Crows + 1) * sizeof(OffsetType)); + + memory_usage_in_Bytes += (Crows + 1) * sizeof(OffsetType); + } + else + { + newmat_offsets.consume(reinterpret_cast(matOut.row_offsets)); + 
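+			// matOut.rows == Crows in this branch, so the existing offsets buffer already
+			// holds Crows + 1 entries; consume() above adopts it into newmat_offsets, and
+			// the raw pointer in matOut is cleared next so the same device allocation is
+			// not released twice once matOut is later filled with the result.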
matOut.row_offsets = nullptr; + } + HANDLE_ERROR(cudaDeviceSynchronize()); + + spgemm.setLaunchDimensions(gridSize, stream, blockSize); + HANDLE_ERROR(cudaDeviceSynchronize()); + //---------------------------------------------------------- + spgemm.h_DetermineBlockStarts( + Arows, + matA.row_offsets, + blockStarts, + reinterpret_cast(outputRowListHead), + outputRowChunkCounter, + newmat_offsets.get(), + requiredBlocks, + completion_status, + (chunckAllocationsSize + numFlags + numCounters + mergeTypeCounters), + chunckAllocations, + (lastSharedRows), + shared_rows_handled, + restart_completion, + chunk_counter, + (lastSharedRows) * (generalized_merge_max_path_options + helper_overhead), + chunkElementConsumedAndPath + ); + HANDLE_ERROR(cudaDeviceSynchronize()); + //---------------------------------------------------------- + if(stats.measure_all) + stats.duration_blockstarts = recordTimer(individual_start, individual_stop, stream); + HANDLE_ERROR(cudaGetLastError()); + do + { + HANDLE_ERROR(cudaDeviceSynchronize()); + currentChunckAllocation = chunckAllocations + (2 * run); + currentFlag = chunckAllocations + (chunckAllocationsSize + run + chunk_pointer_restart_run); + currentCounters = chunckAllocations + (chunckAllocationsSize + numFlags); + if (!rowmerging) + { + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Start spgemm stage with " << requiredBlocks<< " and run: " << run << "\n"; + } + if(stats.measure_all) + startTimer(individual_start, stream); + + // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ + // Stage 2 - Compute SpGEMM + // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ + spgemm.setLaunchDimensions(requiredBlocks, stream, threads); + HANDLE_ERROR(cudaDeviceSynchronize()); + if (Arows < 0x10000 && Bcols < 0x10000) + { + if(DEBUG_MODE) + { + std::cout << "Case 1:\n"; + } + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + + //we can just use 16bit + //---------------------------------------------------------- + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempChunkBuffers[run].get(), currentChunckAllocation, currentChunckAllocation + 1, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + newmat_offsets.get(), outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2, semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + + HANDLE_ERROR(cudaGetLastError()); + } + else if (Bcols < (1ull << LZCNT(nnz_per_thread*threads)) - 1) + { + if(DEBUG_MODE) + { + std::cout << "Case 2:\n"; + } + HANDLE_ERROR(cudaDeviceSynchronize()); + //remap every local row to reduce bit count and use remaining for col ids + //---------------------------------------------------------- + HANDLE_ERROR(cudaGetLastError()); + + HANDLE_ERROR(cudaDeviceSynchronize()); + uint32_t* tempC = tempChunkBuffers[run].get(); + HANDLE_ERROR(cudaGetLastError()); + + void** chunckP = chunckPointers.get(); + HANDLE_ERROR(cudaGetLastError()); + + OffsetType* nmat_f = newmat_offsets.get(); + HANDLE_ERROR(cudaGetLastError()); + + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempC, currentChunckAllocation, currentChunckAllocation + 1, 
tempChunkBufferSizes[run], + chunckP, currentCounters, chunkPointerSize, + nmat_f, outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2, semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + } + else + { + if(DEBUG_MODE) + { + std::cout << "Case 3:\n"; + } + HANDLE_ERROR(cudaGetLastError()); + cudaDeviceSynchronize(); + //---------------------------------------------------------- + spgemm.h_computeSpgemmPart( + matA.data, matA.col_ids, matA.row_offsets, + matB.data, matB.col_ids, matB.row_offsets, + blockStarts, matA.nnz, Arows, + tempChunkBuffers[run].get(), currentChunckAllocation, currentChunckAllocation + 1, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + newmat_offsets.get(), outputRowListHead, outputRowChunkCounter, + sharedRowTracker, currentCounters + 1, avg_row_overlap, 1.0f / avg_row_overlap, + currentFlag, completion_status, chunk_counter, currentCounters + 2,semiring); + //---------------------------------------------------------- + cudaDeviceSynchronize(); + HANDLE_ERROR(cudaGetLastError()); + } + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw SpGEMMException(); + // } + if(stats.measure_all) + stats.duration_spgemm += recordTimer(individual_start, individual_stop, stream); + } + else + { + + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Start Merge Stage\n"; + } + uint32_t simple_restart_offset = 0; + uint32_t max_chunks_restart_offset = mergeBlocks.shared_rows_simple; + uint32_t generalized_restart_offset = mergeBlocks.shared_rows_simple + mergeBlocks.shared_rows_max_chunks; + // Simple Case -> Output fits in shared + if (mergeBlocks.shared_rows_simple) + { + if(stats.measure_all) + startTimer(individual_start, mergeStreams[0]); + + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_simple, mergeStreams[0], threads); + if (Bcols < 1ull << LZCNT(threads - 1)) + { + if (DEBUG_MODE) + { + std::cout << "Case: 1\n"; + } + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsSimple< nnz_per_thread, threads, blocks_per_mp, input_elements_per_thread, retain_elements_per_thread, merge_max_chunks, merge_max_path_options, typename SEMIRING_t::output_t, IndexType, OffsetType, false,T,U,Label, SEMIRING_t>( + combineBlockOffsets.get() + (3 * numSharedRows), combineBlockOffsets.get(), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, simple_restart_offset, currentCounters + 2, semiring + ); + //---------------------------------------------------------- + } + else + { + if (DEBUG_MODE) + { + std::cout << "Case: 2\n"; + } + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsSimple< nnz_per_thread, threads, blocks_per_mp, input_elements_per_thread, retain_elements_per_thread, merge_max_chunks, merge_max_path_options, typename SEMIRING_t::output_t, IndexType, OffsetType, true,T,U,Label, SEMIRING_t>( + combineBlockOffsets.get() + (3 * numSharedRows), combineBlockOffsets.get(), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, 
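The three compute-kernel cases above differ only in how indices are encoded in the per-thread workspace, as the inline comments note: 16-bit ids when both output dimensions fit below 0x10000 (Case 1), locally remapped rows with the leftover bits used for column ids (Case 2), and the general encoding otherwise (Case 3). A rough sketch of that decision, where spareBits is a stand-in for LZCNT(nnz_per_thread*threads) and all names are illustrative:

#include <cstddef>

enum class IdEncoding { Bits16, RemappedRows, Full };

IdEncoding chooseCase(std::size_t Arows, std::size_t Bcols, unsigned spareBits) {
    if (Arows < 0x10000 && Bcols < 0x10000) return IdEncoding::Bits16;       // Case 1
    if (Bcols < (1ull << spareBits) - 1)    return IdEncoding::RemappedRows; // Case 2
    return IdEncoding::Full;                                                 // Case 3
}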
tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, simple_restart_offset, currentCounters + 2,semiring + ); + //---------------------------------------------------------- + } + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeSimpleCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_simple += recordTimer(individual_start, individual_stop, mergeStreams[0]); + } + HANDLE_ERROR(cudaGetLastError()); + // Complex Case -> Output gets merged through paths over MAX_CHUNKS + if (mergeBlocks.shared_rows_max_chunks) + { + if (DEBUG_MODE) + { + std::cout << "Case: 4\n"; + } + if(stats.measure_all) + startTimer(individual_start, mergeStreams[1]); + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_max_chunks, mergeStreams[1], threads); + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsMaxChunks ( + NULL, combineBlockOffsets.get() + (1 * numSharedRows), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, + chunk_indices, chunk_values, chunk_multiplier, + chunkElementCountDataOffset, max_chunks_restart_offset, num_chunks, currentCounters + 2, semiring); + //---------------------------------------------------------- + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeMaxChunksCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_max += recordTimer(individual_start, individual_stop, mergeStreams[1]); + } + HANDLE_ERROR(cudaGetLastError()); + // General Case -> Handles cases with more than MAX_CHUNKS chunks + if (mergeBlocks.shared_rows_generalized) + { + if (DEBUG_MODE) + { + std::cout << "Case: 5\n"; + } + if(stats.measure_all) + startTimer(individual_start, mergeStreams[2]); + spgemm.setLaunchDimensions(mergeBlocks.shared_rows_generalized, mergeStreams[2], threads); + //---------------------------------------------------------- + spgemm.h_mergeSharedRowsGeneralized( + NULL, combineBlockOffsets.get() + (2 * numSharedRows), outputRowListHead, + newmat_offsets.get(), + tempChunkBuffers[run].get(), currentChunckAllocation, NULL, tempChunkBufferSizes[run], + chunckPointers.get(), currentCounters, chunkPointerSize, + currentFlag, restart_completion, shared_rows_handled, + sample_offset, chunkElementConsumedAndPath, generalized_restart_offset, currentCounters + 2, + semiring + ); + //---------------------------------------------------------- + // if (cudaDeviceSynchronize() != cudaSuccess) { + // throw MergeGeneralizedCaseException(); + // } + if(stats.measure_all) + stats.duration_merge_generalized += recordTimer(individual_start, individual_stop, mergeStreams[2]); + } + } + //HANDLE_ERROR(cudaGetLastError()); + // // Copy back flags + /*cudaPointerAttributes attr; + for (int i = 0; i < numFlags+numCounters; ++i) { + HANDLE_ERROR(cudaPointerGetAttributes(&attr, chunckAllocations + chunckAllocationsSize + i)); + uint32_t test = flagsAndListAllocCounters[i]; + if(attr.type != 2) std::cout << attr.type << std::endl; + }*/ + + //std::cout << "FLAG COPY " << id << std::endl; + /*for (int i = 0; i < (numFlags + numCounters); ++i) + { + cudaPointerAttributes attr; + cudaError_t error = cudaPointerGetAttributes(&attr, chunckAllocations + chunckAllocationsSize + i); + + if (error != cudaSuccess) + { + std::cerr << "Error getting 
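As the comments above state, shared rows (rows written by more than one block) fall into three merge classes: rows whose merged output fits into shared memory, rows with at most MAX_CHUNKS contributing chunks that are merged along paths, and everything larger, which is handled by the generalized kernel. A schematic of that classification — the thresholds are stand-ins for the real template parameters and shared-memory capacity:

enum class MergePath { Simple, MaxChunks, Generalized };

MergePath classifyRow(unsigned chunkCount, unsigned mergedRowLength,
                      unsigned sharedCapacity, unsigned mergeMaxChunks) {
    if (mergedRowLength <= sharedCapacity) return MergePath::Simple;    // output fits in shared
    if (chunkCount <= mergeMaxChunks)      return MergePath::MaxChunks; // path merge over <= MAX_CHUNKS
    return MergePath::Generalized;                                      // more than MAX_CHUNKS chunks
}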
pointer attributes: " + << cudaGetErrorString(error) << std::endl; + } + else if (attr.type != cudaMemoryTypeDevice && id == 0) + { + std::cout << "ERR" << std::endl; // This should never happen + } + }*/ + //MPI_Barrier(MPI_COMM_WORLD); // Synchronize after CUDA operations + HANDLE_ERROR(cudaGetLastError()); + HANDLE_ERROR(cudaMemcpy(flagsAndListAllocCounters, chunckAllocations + chunckAllocationsSize, (numFlags + numCounters) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + //MPI_Barrier(MPI_COMM_WORLD); // Synchronize after CUDA operations + //std::cout << "FLAG COPY DONE " << id << std::endl; + completed = flagsAndListAllocCounters[run + chunk_pointer_restart_run] == 0; + + if (!completed) + { + // if (stats.measure_all && stats.duration_merge_simple + stats.duration_merge_max + stats.duration_merge_generalized > 10000) + // throw MergeLoopingException(); + + + uint32_t return_value = flagsAndListAllocCounters[run + chunk_pointer_restart_run]; + if (UintBitSet(return_value).test(0)) + { + if (DEBUG_MODE) + { + std::cout << "Chunk Memory Restart allocating space for " << tempChunkBufferSizes[run] / (sizeof(typename SEMIRING_t::rightInput_t) + sizeof(IndexType)) << " elements\n"; + } + // Get more chunk memory + auto new_buffer_size = tempChunkBufferSizes[run]; + tempChunkBufferSizes[run+1] = new_buffer_size; + tempChunkBuffers[run+1] = CU::allocMemory(new_buffer_size); + if (++run == chunckAllocationsSize / 2) { + std::cout << "Out of memory " << std::endl; + throw RestartOutOfMemoryException(); + } + HANDLE_ERROR(cudaGetLastError()); + } + if (UintBitSet(return_value).test(1)) + { + if (DEBUG_MODE) + { + std::cout << "Chunk Pointer Restart allocating " << targetChunkPointerSize << " new pointers\n"; + } + // Get more chunk pointers + chunkPointerSize += targetChunkPointerSize; + chunckPointers.increaseMemRetainData((targetChunkPointerSize) * 8); + targetChunkPointerSize *= 2; + if (++chunk_pointer_restart_run == chunckAllocationsSize / 2) + throw RestartOutOfChunkPointerException(); + HANDLE_ERROR(cudaMemcpy(currentCounters, currentCounters + 2, sizeof(uint32_t), cudaMemcpyDeviceToDevice)); + } + } + if (completed && !rowmerging) + { + numSharedRows = flagsAndListAllocCounters[numFlags + 1]; + if (numSharedRows > 0) + { + if(stats.measure_all) + startTimer(individual_start, stream); + + if (combineBlockOffsetsSize < 4 * (numSharedRows + 1)) + { + combineBlockOffsetsSize = 4 * (numSharedRows + 1024); + combineBlockOffsets.assure(combineBlockOffsetsSize * sizeof(IndexType)); + memory_usage_in_Bytes += combineBlockOffsetsSize * sizeof(IndexType); + } + CUdeviceptr mergeTypeCounters = reinterpret_cast(chunckAllocations) + 4 * (chunckAllocationsSize + numFlags + numCounters); + + //---------------------------------------------------------- + mergeBlocks = spgemm.assignCombineBlocks(numSharedRows, prefixSumTemp, prefixSumTempMemSize, sharedRowTracker, newmat_offsets, outputRowChunkCounter, combineBlockOffsets, mergeTypeCounters, stream); + //---------------------------------------------------------- + + completed = false; + rowmerging = true; + + if(DEBUG_MODE) + { + std::cout << "################################################\n"; + std::cout << "Assigned " << numSharedRows << " shared rows to blocks, starting \n\t\t" + << mergeBlocks.shared_rows_simple << " simple merges for " << mergeBlocks.shared_rows_simple_rows << " rows,\n\t\t" + << mergeBlocks.shared_rows_max_chunks << " max chunk mergers, and\n\t\t" + << mergeBlocks.shared_rows_generalized << " general mergers\n"; + } + + // Set 
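The restart handling above is a small handshake: each stage leaves the flag word at zero on success, sets bit 0 when it runs out of chunk memory, and sets bit 1 when it runs out of chunk pointers; the host grows the corresponding resource and repeats the stage until the flag reads zero. A compact sketch of that loop body:

#include <bitset>
#include <cstdint>

template <typename GrowMem, typename GrowPtrs>
bool handleRestartFlag(std::uint32_t flag, GrowMem growChunkMemory, GrowPtrs growChunkPointers) {
    if (flag == 0) return true;              // stage completed, leave the do/while loop
    std::bitset<32> bits(flag);
    if (bits.test(0)) growChunkMemory();     // bit 0: allocate another temp chunk buffer
    if (bits.test(1)) growChunkPointers();   // bit 1: enlarge the chunk-pointer array
    return false;                            // re-run the stage with more resources
}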
merge stage row stats + stats.shared_rows = numSharedRows; + stats.simple_mergers = mergeBlocks.shared_rows_simple; + stats.simple_rows = mergeBlocks.shared_rows_simple_rows; + stats.complex_rows = mergeBlocks.shared_rows_max_chunks; + stats.generalized_rows = mergeBlocks.shared_rows_generalized; + merge_simple_rows = mergeBlocks.shared_rows_simple; + merge_max_chunks_rows = mergeBlocks.shared_rows_max_chunks; + merge_generalized_rows = mergeBlocks.shared_rows_generalized; + + // Allocate memory for all helper data structures + size_t combinedMergeStageMemory_size = + /*shared_rows_handled*/((numSharedRows) * sizeof(uint32_t)) + + /*restart_completion*/((numSharedRows) * sizeof(uint32_t)) + + /*chunkElementConsumedAndPath*/((numSharedRows) * (generalized_merge_max_path_options + helper_overhead) * sizeof(uint32_t)) + + /*chunkElementCountDataOffset*/(((numSharedRows) * merge_max_chunks) * sizeof(uint32_t)) + + /*num_chunks*/((numSharedRows) * sizeof(uint32_t)) + + /*sample_offset*/(((numSharedRows) * (threads) * sizeof(uint32_t))); //+ + ///* chunk_indices*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*)) + + ///*chunk_values*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::input_t*)) + + ///*chunk_multiplier*/(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::input_t)); + combinedMergeStageMemory.assure(combinedMergeStageMemory_size); + memory_usage_in_Bytes += combinedMergeStageMemory_size; + HANDLE_ERROR(cudaGetLastError()); + //// Place pointers in memory allocation + shared_rows_handled = combinedMergeStageMemory.get(); + restart_completion = shared_rows_handled + (numSharedRows); + chunkElementConsumedAndPath = restart_completion + (numSharedRows); + chunkElementCountDataOffset = chunkElementConsumedAndPath + (numSharedRows) * (generalized_merge_max_path_options + helper_overhead); + num_chunks = chunkElementCountDataOffset + ((numSharedRows) * merge_max_chunks); + sample_offset = num_chunks + (numSharedRows); + + // TODO: Why does this work?????????????????????????? + chunk_indices_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*)); + chunk_indices = chunk_indices_cptr.get(); + chunk_values_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof( Either)); + chunk_values = chunk_values_cptr.get< Either>(); + chunk_multiplier_cptr.assure(((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::leftInput_t)); + chunk_multiplier = chunk_multiplier_cptr.get(); + + + // TODO: Why does this NOT work?????????????????????????? 
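One plausible, unverified explanation for the two TODOs above: in the packed layout, chunk_indices would begin directly after a run of uint32_t arrays, so its offset from the pool base is only guaranteed to be 4-byte aligned while the array stores 8-byte pointers; the separate allocation used instead always returns suitably aligned storage. A small hypothetical check of that alignment condition:

#include <cstddef>
#include <cstdint>

// Returns false when a pointer-sized array placed after `precedingUint32Count`
// 32-bit words would be misaligned for 8-byte loads (hypothesis check only).
bool pointerArrayWouldBeAligned(std::size_t precedingUint32Count) {
    std::size_t offsetBytes = precedingUint32Count * sizeof(std::uint32_t);
    return offsetBytes % alignof(void*) == 0;
}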
+ /*chunk_indices = reinterpret_cast(chunk_multiplier + ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks));*/ + /*chunk_values = reinterpret_cast(chunk_indices + ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks));*/ + // chunk_multiplier = reinterpret_cast(sample_offset + ((numSharedRows) * (threads))); + + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(IndexType*); + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(Either); + memory_usage_in_Bytes += ((mergeBlocks.shared_rows_max_chunks) * merge_max_chunks) * sizeof(typename SEMIRING_t::rightInput_t); + + if (numSharedRows > lastSharedRows) + { + cudaMemset(combinedMergeStageMemory.get(), 0, + /*chunkElementConsumedAndPath*/((numSharedRows) * (generalized_merge_max_path_options + helper_overhead) * sizeof(uint32_t)) + + /*shared_rows_handled*/((numSharedRows) * sizeof(uint32_t)) + + /*restart_completion*/((numSharedRows) * sizeof(uint32_t)) + ); + lastSharedRows = numSharedRows; + } + if(stats.measure_all) + stats.duration_merge_case_computation = recordTimer(individual_start, individual_stop, stream); + HANDLE_ERROR(cudaGetLastError()); + } + } + } while (!completed); + //std::cout << "WE OUT" << std::endl; + //delete[] flagsAndListAllocCounters; + // Let's write the chunks out to a csr matrix + if(stats.measure_all) + startTimer(individual_start, stream); + + //---------------------------------------------------------- + spgemm.computeRowOffsets(Crows, prefixSumTemp, prefixSumTempMemSize, newmat_offsets, stream); + //---------------------------------------------------------- + + // Allocate output matrix + IndexType matrix_elements; + CUdeviceptr offs = newmat_offsets; + offs += sizeof(IndexType) * Crows; + HANDLE_ERROR(cudaMemcpy(&matrix_elements, reinterpret_cast(offs), sizeof(IndexType), cudaMemcpyDeviceToHost)); + + if (matOut.nnz != matrix_elements) + { + //std::cout << "Reallocation HERE ################" << matOut.nnz << " | " << matrix_elements <<"\n"; + matOut.alloc(Crows, Ccols, matrix_elements, false); + } + matOut.row_offsets = std::move(newmat_offsets.getRelease()); + + //---------------------------------------------------------- + spgemm.h_copyChunks(chunckPointers.get(), currentCounters, + matOut.data, matOut.col_ids, matOut.row_offsets); + //---------------------------------------------------------- + if(stats.measure_all) + stats.duration_write_csr = recordTimer(individual_start, individual_stop, stream); + + if (stats.measure_all) + { + stats.mem_allocated_chunks = tempChunkBufferSizes[0] * (run + 1); + uint32_t* d_current_chunk_allocation = chunckAllocations + (2 * run); + uint32_t h_current_chunk_allocation = 0; + HANDLE_ERROR(cudaMemcpy(&h_current_chunk_allocation, d_current_chunk_allocation, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + stats.mem_used_chunks = tempChunkBufferSizes[0] * run + h_current_chunk_allocation; + } + stats.restarts = run + chunk_pointer_restart_run; + HANDLE_ERROR(cudaGetLastError()); + // ############################## + stats.duration = recordTimer(ce_start, ce_stop, stream); + // ############################## + + // Stream cleanup + if (!(stats.measure_all)) + { + for (int i = 0; i < number_merge_streams; ++i) + cudaStreamDestroy(mergeStreams[i]); + } + HANDLE_ERROR(cudaGetLastError()); + return; + } + + template + struct Selection + { + CB& cb; + Selection(CB& cb) : cb(cb) {} + }; + + template + struct CallSelection + { + static void call(CB &cb) + { + cb. 
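The CSR write-out above follows the usual scan pattern: computeRowOffsets turns the per-row element counts into row offsets via a prefix sum, the entry at index Crows is the total nnz (read back with a single cudaMemcpy), and h_copyChunks then streams the chunk contents into the newly sized data/col_ids arrays. A host-side sketch of the offsets step, with illustrative types:

#include <cstdint>
#include <numeric>
#include <vector>

// Exclusive prefix sum over per-row counts; offsets.back() is the total nnz.
std::vector<std::uint32_t> rowCountsToOffsets(const std::vector<std::uint32_t>& counts) {
    std::vector<std::uint32_t> offsets(counts.size() + 1, 0);
    std::partial_sum(counts.begin(), counts.end(), offsets.begin() + 1);
    return offsets;
}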
template call(); + } + }; + + struct EnumFin + { + template + static bool call(Selection cb) + { + CallSelection::call(cb.cb); + return true; + } + }; + + template + struct EnumOption + { + template + static bool call(Selection cb, int value, TYPES... values) + { + if (value == CURRENT) + { + return NEXT::call(Selection(cb.cb), values...); + } + else + return EnumOption::call(cb, value, values...); + } + }; + + template + struct EnumOption + { + template + static bool call(Selection cb, int value, TYPES... values) + { + if (value == MAX) + { + return NEXT::call(Selection(cb.cb), values...); + } + else + return false; + } + }; + + + template + struct MultiplyCall + { + const dCSR& A; + const dCSR& B; + dCSR &matOut; + SEMIRING_t semiring; + + const GPUMatrixMatrixMultiplyTraits& scheduling_traits; + ExecutionStats& exec_stats; + + MultiplyCall(const dCSR& A, const dCSR& B, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& scheduling_traits, ExecutionStats& exec_stats, SEMIRING_t semiring) : + A(A), B(B), matOut(matOut), scheduling_traits(scheduling_traits), exec_stats(exec_stats), semiring(semiring) + { + + } + + template + void call() + { + const int RealBlocksPerMP = (256 * BlocksPerMP + Threads - 1) / Threads; + ACSpGEMM::MultiplyImplementation(A, B, matOut, scheduling_traits, exec_stats,semiring); + } + }; + + + template < typename SEMIRING_t> + void Multiply(const dCSR& A, const dCSR& B, dCSR& matOut, const GPUMatrixMatrixMultiplyTraits& scheduling_traits, ExecutionStats& exec_stats, bool DEBUG_MODE, SEMIRING_t semiring) + { + HANDLE_ERROR(cudaGetLastError()); + MultiplyCall call(A, B, matOut, scheduling_traits, exec_stats, semiring); + HANDLE_ERROR(cudaGetLastError()); + + bool called = EnumOption<128, 256, 128, + EnumOption<1, 1, 1, + EnumOption<2, 2,2, + EnumOption<2, 2, 2, + EnumOption<1, 1, 1, + EnumOption<16, 16, 8, + EnumOption<512, 512, 256, + EnumOption<8, 8, 8, + EnumOption<0, 1, 1>>>>>>>>> + ::call(Selection>(call), scheduling_traits.Threads, scheduling_traits.BlocksPerMp, scheduling_traits.NNZPerThread, scheduling_traits.InputElementsPerThreads, scheduling_traits.RetainElementsPerThreads, scheduling_traits.MaxChunksToMerge, scheduling_traits.MaxChunksGeneralizedMerge, scheduling_traits.MergePathOptions, (int)DEBUG_MODE); + if(!called) + { + std::cout << "Configuration not instantiated!\n"; + } + }; +} + diff --git a/include/GALATIC/source/device/memory.cuh b/include/GALATIC/source/device/memory.cuh new file mode 100644 index 00000000..2103e0a2 --- /dev/null +++ b/include/GALATIC/source/device/memory.cuh @@ -0,0 +1,63 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and 
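For readers unfamiliar with the EnumOption chain above: it maps the runtime scheduling traits onto one of the pre-instantiated template configurations by recursing over a compile-time value range and comparing each candidate with the runtime value; if nothing matches, call() returns false and "Configuration not instantiated!" is printed. A stripped-down, single-parameter version of the same idea:

#include <iostream>

template <int THREADS>
struct Kernel { static void run() { std::cout << "instantiated for " << THREADS << " threads\n"; } };

// Walk the compile-time range CUR, CUR+STEP, ..., MAX and invoke the matching instantiation.
template <int CUR, int MAX, int STEP>
bool dispatchThreads(int value) {
    if (value == CUR) { Kernel<CUR>::run(); return true; }
    if constexpr (CUR + STEP <= MAX) return dispatchThreads<CUR + STEP, MAX, STEP>(value);
    return false;   // configuration not instantiated
}

// Usage: dispatchThreads<128, 256, 128>(threads) covers the {128, 256} options.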
this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +#pragma once + +#include "../../include/devicetools/error.h" +#include "../../include/devicetools/memory.h" +#include +#include + + + +namespace CU +{ + unique_ptr allocMemory(std::size_t size) + { + CUdeviceptr ptr; + cudaMalloc(reinterpret_cast(&ptr), size); + return unique_ptr(ptr); + } + + unique_ptr allocMemoryPitched(std::size_t& pitch, std::size_t row_size, std::size_t num_rows, unsigned int element_size) + { + CUdeviceptr ptr; + cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); + return unique_ptr(ptr); + } + + pitched_memory allocMemoryPitched(std::size_t row_size, std::size_t num_rows, unsigned int element_size) + { + CUdeviceptr ptr; + std::size_t pitch; + cudaMallocPitch(reinterpret_cast(&ptr), &pitch, row_size, num_rows); + return pitched_memory(unique_ptr(ptr), pitch); + } +} diff --git a/include/GALATIC/source/main.cu b/include/GALATIC/source/main.cu new file mode 100644 index 00000000..616951c2 --- /dev/null +++ b/include/GALATIC/source/main.cu @@ -0,0 +1,436 @@ +// Project AC-SpGEMM +// https://www.tugraz.at/institute/icg/research/team-steinberger/ +// +// Copyright (C) 2018 Institute for Computer Graphics and Vision, +// Graz University of Technology +// +// Author(s): Martin Winter - martin.winter (at) icg.tugraz.at +// Daniel Mlakar - daniel.mlakar (at) icg.tugraz.at +// Rhaleb Zayer - rzayer (at) mpi-inf.mpg.de +// Hans-Peter Seidel - hpseidel (at) mpi-inf.mpg.de +// Markus Steinberger - steinberger ( at ) icg.tugraz.at +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
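The CU::allocMemory wrappers above exist so that device allocations are owned by a smart-pointer type instead of raw CUdeviceptr values. A minimal sketch of that RAII idea — illustrative only, not the devicetools API:

#include <cuda_runtime.h>
#include <cstddef>

class DeviceBuffer {
    void* ptr_ = nullptr;
public:
    explicit DeviceBuffer(std::size_t bytes) { cudaMalloc(&ptr_, bytes); }
    ~DeviceBuffer() { cudaFree(ptr_); }                     // freed exactly once
    DeviceBuffer(const DeviceBuffer&) = delete;             // non-copyable to avoid double free
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    void* get() const { return ptr_; }
};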
+// + +/*!/------------------------------------------------------------------------------ + * Main.cpp + * + * ac-SpGEMM + * + * Authors: Daniel Mlakar, Markus Steinberger, Martin Winter + *------------------------------------------------------------------------------ +*/ + +// Global includes +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// Local includes +#include "CSR.cuh" +#include "COO.cuh" +#include "Vector.h" +#include "dCSR.cuh" +#include "dVector.h" +#include "device/Multiply.cuh" +#include "Transpose.h" +#include "Compare.cuh" +#include "CPU_SpGEMM.h" +// CuSparse include +//#include "cusparse/include/cuSparseMultiply.h" + +// // Nsparse include +// #ifndef NONSPARSE +// #include "nsparse/include/nsparseMultiply.h" +// #endif + +// // RMerge include +// #ifndef NORMERGE +// #include "RMerge/include/rmergeMultiply.h" +// #endif +// const uint64_t max(uint64_t x, uint64_t y) { +// return x < y ? x :y; +// } +// // BhSparse include +// #ifndef NOBHSPARSE +// #include"bhSparse/include/bhSparseMultiply.h" +// #endif + +//foo::foo(int x) { +// this->a =x; +// +//} +template +void log_good(T& s) { + std::cout << "\033[1;31," << s << "\033[0m"; +} + +struct triv {}; + +struct mr2 { + int16_t val; + uint8_t temp; + uint8_t temp2; + + uint8_t temp3; + + uint8_t temp4; + + uint8_t temp5; + +}; + +struct MinRing : SemiRing { + int16_t val; + int16_t val2; + + + // __device__ __host__ MinRing(int32_t x, int32_t y) { + // val = x; + // } + // __device__ __host__ MinRing(int32_t x) { + // val = x; + // } + + // __device__ __host__ ~MinRing() { + // } + + // __device__ __host__ MinRing() { + // val = INT16_MIN; + // } + + static MinRing Init(double x) { + MinRing res; + res.val = (short) x; + return res; + } + __device__ __host__ mr2 multiply( MinRing & a, MinRing & b) const { + return mr2 { static_cast(a.val == INT16_MAX || b.val == INT16_MAX ? INT16_MAX : a.val + b.val ),0}; + } + __device__ __host__ mr2 add(const mr2 & a, const mr2 & b)const { + + return mr2 { a.val < b.val ? a.val : b.val,0} ; + } + + __device__ bool operator==(const MinRing& rhs) const + { + return val == rhs.val; + } + + static __host__ __device__ MinRing MultiplicativeIdentity() { + MinRing result; + result.val = 0; + return result; + } + static __host__ __device__ mr2 AdditiveIdentity() { + return mr2 { INT16_MAX ,0}; + } +}; + + + + + + +unsigned int padding = 0; +template +std::string typeext() { + //FIXME not-C++ standard compliant + return typeid(T).name(); +} +template<> +std::string typeext() +{ + return std::string(""); +} +template<> std::string typeext() +{ + return std::string("i32_"); +} +template<> +std::string typeext() +{ + return std::string("d_"); +} + +void printCheckMark() +{ + printf("\n #\n #\n #\n # #\n # #\n #\n\n"); +} + +void printCross() +{ + printf("\n # # \n # # \n # # \n # \n # # \n # # \n # # \n\n"); +} + +int main(int argc, char *argv[]) +{ + + + std::cout << "########## ac-SpGEMM ##########" << std::endl; + + char *filename; + bool print_stats{ false }; + if (argc == 1) + { + std::cout << "Require filename of .mtx as first argument" << std::endl; + return -1; + } + + filename = argv[1]; + + int device = 0; + // if (argc >= 3) + // device = std::stoi(argv[2]); + + bool testing = false; + // if(argc >= 4) + // testing = std::stoi(argv[3]) > 0 ? 
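As a concrete reference for what the MinRing semiring above computes: substituting (min, +) for (add, multiply) means every output entry is C(i,j) = min over k of (A(i,k) + B(k,j)), with INT16_MAX acting as the additive identity ("no path") and the multiply saturating at INT16_MAX. A plain CPU illustration for a single dense entry, independent of the GALATIC types:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::int16_t minPlusEntry(const std::vector<std::int16_t>& Arow,
                          const std::vector<std::int16_t>& Bcol) {
    std::int16_t best = INT16_MAX;                                  // AdditiveIdentity
    for (std::size_t k = 0; k < Arow.size(); ++k) {
        if (Arow[k] == INT16_MAX || Bcol[k] == INT16_MAX) continue; // multiply() saturates
        best = std::min<std::int16_t>(best, static_cast<std::int16_t>(Arow[k] + Bcol[k]));
    }
    return best;
}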
true : false; + + cudaSetDevice(device); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device); + std::cout << "Going to use " << prop.name << " " << prop.major << "." << prop.minor << "\n"; + + // CSR matrices on the device + CSR csr_mat, csr_T_mat, result_mat, test_mat; + dCSR dcsr_mat, dcsr_T_mat ;//, d_nsparse_result_mat, d_rmerge_result_mat, d_bhSparse_result_mat; + + dCSR d_result_mat_comp, d_result_mat; + //try load csr file + std::string csr_name = std::string(argv[1]) + typeext() + ".hicsr"; + try + { + std::cout << "trying to load csr file \"" << csr_name << "\"\n"; + csr_mat = loadCSR(csr_name.c_str()); + std::cout << "succesfully loaded: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << "could not load csr file:\n\t" << ex.what() << "\n"; + try + { + std::cout << "trying to load mtx file \"" << argv[1] << "\"\n"; + COO coo_mat= loadMTX(argv[1]); + // coo_mat.alloc(2,2,4); + // coo_mat.data[0]= MinRing::Init(1); + // coo_mat.data[1]= MinRing::Init(2); + // coo_mat.data[2]= MinRing::Init(3); + // coo_mat.data[3]= MinRing::Init(4); + + + // coo_mat.row_ids[0] = 0; + // coo_mat.col_ids[0] = 0; + + // coo_mat.row_ids[1] = 0; + // coo_mat.col_ids[1] = 1; + + + // coo_mat.row_ids[2] = 1; + // coo_mat.col_ids[2] = 0; + + // coo_mat.row_ids[3] = 1; + // coo_mat.col_ids[3] = 1; + + + + + convert(csr_mat, coo_mat); + std::cout << "succesfully loaded and converted: \"" << csr_name << "\"\n"; + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + return -1; + } + try + { + std::cout << "write csr file for future use\n"; + storeCSR(csr_mat, csr_name.c_str()); + } + catch (std::exception& ex) + { + std::cout << ex.what() << std::endl; + } + } + + // Convert host csr to device csr + convert(dcsr_mat, csr_mat, padding); + + + + bool transpose = (dcsr_mat.rows != dcsr_mat.cols); + if (transpose) + { + std::cout << "Matrix not square (" << dcsr_mat.rows << "x" << dcsr_mat.cols << ") - Calculate Transpose!\n"; + /*ACSpGEMM::Transpose(dcsr_mat, dcsr_T_mat);*/ + convert(csr_T_mat, dcsr_T_mat, padding); + } + + printf("Input Matrix A: (%zu x %zu) - NNZ: %zu\n", dcsr_mat.rows, dcsr_mat.cols, dcsr_mat.nnz); + if(transpose) + printf("Input Matrix B: (%zu x %zu) - NNZ: %zu\n", dcsr_T_mat.rows, dcsr_T_mat.cols, dcsr_T_mat.nnz); + + + + + const int Threads = 128; + const int BlocksPerMP = 1; + const int NNZPerThread = 2; + const int InputElementsPerThreads = 2; + const int RetainElementsPerThreads = 1; + const int MaxChunksToMerge = 8; + const int MaxChunksGeneralizedMerge = 512; // MAX: 865 + const int MergePathOptions = 8; + + GPUMatrixMatrixMultiplyTraits DefaultTraits(Threads, BlocksPerMP, NNZPerThread, InputElementsPerThreads, RetainElementsPerThreads, MaxChunksToMerge, MaxChunksGeneralizedMerge, MergePathOptions); // DefaultTraits(128, 2, 4, 1, 8, 128, 8); + const bool Debug_Mode = true; + bool checkBitStability{true}; + DefaultTraits.preferLoadBalancing = true; + ExecutionStats stats, warmupstats, output_stats; + stats.measure_all = false; + output_stats.measure_all = false; + + uint32_t warmupiterations = testing ? checkBitStability ? 1 : 0: 20; + uint32_t iterations = testing ? 1 : 20; + + + + // Multiplication + /*if (testing) + std::cout << "Iteration: " << i + 1 << "\n";*/ + MinRing j = MinRing { }; + + std::cout << "Performing SpGEMM, GPU" << std::endl; + ACSpGEMM::Multiply(dcsr_mat, transpose ? 
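For context, convert(csr_mat, coo_mat) above performs the usual COO-to-CSR conversion: count nonzeros per row, prefix-sum the counts into row_offsets, then copy the column ids and values over. A host-side sketch under the assumption that the triplets are already sorted by row; the types are illustrative, not the library's CSR/COO classes:

#include <cstdint>
#include <vector>

struct CooHost { std::vector<std::uint32_t> row, col; std::vector<double> val; std::uint32_t rows; };
struct CsrHost { std::vector<std::uint32_t> row_offsets, col_ids; std::vector<double> data; };

CsrHost cooToCsr(const CooHost& in) {
    CsrHost out;
    out.row_offsets.assign(in.rows + 1, 0);
    for (std::uint32_t r : in.row) ++out.row_offsets[r + 1];   // count nnz per row
    for (std::uint32_t i = 0; i < in.rows; ++i)                // prefix sum -> offsets
        out.row_offsets[i + 1] += out.row_offsets[i];
    out.col_ids = in.col;                                      // row-sorted input keeps CSR order
    out.data = in.val;
    return out;
}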
dcsr_T_mat : dcsr_mat, d_result_mat, DefaultTraits, stats, Debug_Mode,j); + std::cout << "SpGEMM Done\n"; + + CSR out; + std::cout << "Performing SpGEMM, CPU" << std::endl; + + Mult_CPU(csr_mat, csr_mat, out, j); + std::cout << "CPU-SpGEMM Done\n"; + + std::ofstream log_f; + + + if(argc >= 3) + { + log_f.open(argv[2]); + } + + + + CSR coo_mat; + + convert(coo_mat, d_result_mat,0); + COO coo; + cudaDeviceSynchronize(); + + uint64_t err_count = 0; + uint64_t checked = 0; + if (coo_mat.nnz != out.nnz) { + if (argc >= 3) { + log_f << "ERROR:" << "nonzeros GPU: " << coo_mat.nnz << " vs non-zeros cpu:" << out.nnz <= 3) { + for (int i =0; i < coo_mat.nnz; i++) { + + if (coo_mat.data[i].val != out.data[i].val){ + log_f << "ERROR, NNZ Entry#: " << i << " (" << coo_mat.row_offsets[i] << ", " << coo_mat.col_ids[i] << ") gpu: " << coo_mat.data[i].val << " vs CPU: " << out.data[i].val << std::endl; + err_count++; + } else { + checked++;// this can be calulated from errocount, but I'm being paranoid to make sure we don't trivially pass + } + } + + + log_f << "Total errors: " <= 3) + { + log_f << "output NNZ checked" << coo_mat.nnz << std::endl; + std::cout << "NNZ correct / # of checked output: " << checked << "/" << coo_mat.nnz << std::endl; + + log_f.close(); + + } + + std::cout << "Total errors: " << err_count <(d_result_mat_comp, d_result_mat, true)) +// { +// printf("NOT Bit-Identical\n"); +// printCross(); +// exit(-1); +// } +// else +// { +// printf("Bit-Identical\n"); +// printCheckMark(); +// } + + + + + // output_stats.normalize(); + // std::cout << output_stats; + // std::cout << "-----------------------------------------------\n"; + + // if(checkBitStability) + // return 0; + + return 0; +} + + diff --git a/mfiles/betwCentrality.m b/mfiles/betwCentrality.m index 94d2b992..f32c23f5 100644 --- a/mfiles/betwCentrality.m +++ b/mfiles/betwCentrality.m @@ -1,266 +1,266 @@ -function bc = betwCentrality( G, K4approx, batchSize ) - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Function betwCentrality() - Kernel 4, analyze a graph's connectivity. -% -% The fourth computational kernel computes the betweenness centrality for -% an unweighted graph, using only matrix operations. Betweenness centrality -% is a measure of the importance of a vertex with respect to the shortest -% paths between other vertices in the graph that it lies on. This function -% computes an ordered list of centralities, each centrality corresponding -% to a specific vertex in the graph. -% -% The high computational cost of kernel 4: -% An exact implementation would consider all of the vertices as starting -% points in the betweenness centrality metric; this implementation -% can be 'dialed' to use a subset of starting vertices to obtain an -% approximation of the betweenness centrality. -% -% For a detailed description of the SCCA #2 graph analysis algorithm, -% please see SCCA #2 Graph Analysis Written Specification, V2.2. -% -% NOTES: -% -% This code is the vectorized version of the pseudo-code provided in the -% specification. It is designed to process a full level in the search tree -% at a time rather than just a single vertex. All of the operations are -% performed in he same way, only this code is able to perform them in -% parallel using sparse matrices and matrix operations. In addition, -% rather than processing a single vertex at a time, it has a configurable -% batch size parameter. 
While increasing the size of a batch increases the -% space required by the algorithm, it may also increase the performance. -% -% This uses Ulrik Brandes' Algorithm from "A faster algorithm for -% betweenness centrality", where variables are named in the following way: -% -% Ulrik Brandes This Code -% ----------------------------------------------------------------------- -% C_B bc -% P, d (unused) -% S, Q bfs -% s batch -% sigma nsp -% delta bcu -% -% S and Q can be stored using the same variable. This optimization can be -% performed in the original algorithm as well by simply using a vector for -% storage rather than a stack and a queue. Instead of discarding vertices -% from the top of Q, the vector pointer is advanced. The stack S -% corresponds to the vertices of the array in reverse order. -% -% bfs is stored as a matrix rather than a vector. Rather than looking at a -% single vertex at a time, all vertices at a particular depth are examined. -% -% D is not required. It was used previously to determine the -% distance between two vertices. In this implementation, this can be -% computed by looking at bfs. In addition, since all the nodes at a -% particular depth in the search are examined at the same time, all -% previously unseen vertices must be on shortest paths. -% -% P is computed rather than stored by selecting edges that go between -% vertices at neighboring depths. -% -% References: -% -% D.A. Bader and K. Madduri, "Parallel Algorithms for Evaluating Centrality -% Indices in Real-world Networks", Proc. The 35th International Conference -% on Parallel Processing (ICPP), Columbus, OH, August 2006. -% -% Ulrik Brandes, "A faster algorithm for betweenness centrality". Journal -% of Mathematical Sociology, 25(2):163177, 2001. -% -% L.C. Freeman, "A set of measures of centrality based on betweenness". -% Sociometry, 40(1):3541, 1977. -% -% -% INPUT -% -% G. - [struct] graph (from kernel 1). -% adjMatrix - sparse weighted adjacency matrix of the graph. -% K4approx - [int] binary exponent of the number of times that the -% algorithm is to loop, between 1 and SCALE. This -% simplification reduces its computational time from O(MN) -% to O(M*2^K4approx), which is important when testing large -% graphs. It determines the amount of work performed by -% kernel 4. When 'K4approx' equals 'SCALE', this -% implementation is exact. Otherwise, distinct vertices -% are selected randomly (user). -% batchSize - [int] the number of vertices to process at once. The space -% required by the algorithm increases linearly in this -% parameter. While there is no theoretical decrease in -% runtime by increasing this parameter, in actual -% implementations performance may increase due to batch -% processing of the operations. -% -% OUTPUT -% -% bc - [1D array, float] Betweenness centrality is a measure of -% the importance of a vertex with respect to the shortest -% paths between other vertices in the graph it lies on. bc is -% a list of the centralities that were computed (ordered by -% vertex number). -% -% -% REVISION -% 12-Oct-07 1.0 Release MIT Lincoln Laboratory. 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% Allocate the data structures and initialize variables: -% Name Dimension Entries -% A : B^(N x N) M -% bfs : B^(batchSize x N x N) batchSize x N -% nsp : Z+^(batchSize x N) batchSize x N -% bcu : R^(batchSize x N) batchSize x N -% nspInv : R^(batchSize x N) batchSize x N -% w : R^(batchSize x N) batchSize x N -% fringe : Z+^(batchSize x N) < batchSize x N -% bc : R^(N) N -% batch : Z+^(batchSize) batchSize - -% Variable Description: -% A : The adjacency matrix. An entry at (x,y) indicates an edge -% coming from vertex x going to vertex y. Used only in its -% boolean form in this computation (unweighted). -% N : The number of vertices in the graph. -% batchSize : The number of vertices to process simultaneously. -% Increasing this number increases the amount of storage -% required by the algorithm, but may also increase the -% performance due to batch processing of the data. -% batch : The vertices in the current batch to be processed. -% bfs : The breadth-first search tree discovered. An entry at -% (x,y,z) indicates that for the root vertex batch(x), vertex -% y was discovered at depth z in the breadth-first search. -% nsp : The number of shortest paths. An entry (x,y)=m indicates -% that for root vertex batch(x), vertex y has m shortest -% paths to it. -% bcu : The centrality updates. An entry (x,y)=m indicates that -% root vertex batch(x) contributes m to the betweenness -% centrality for vertex y. -% nspInv : The inverse of the number of shortest paths. An entry -% (x,y)=m indicates that for root vertex batch(x), vertex y -% has 1/m shortest paths to it. -% w : The child weights during the centrality update. An entry -% (x,y)=m indicates that for root vertex batch(x), child -% vertex y applies a weight of m to all its parent vertices -% during the centrality update. -% fringe : The current open queue of the breadth-first search. When -% the depth is d in the breadth-first search, an entry -% (x,y)=m indicates that for root vertex batch(x), vertex y -% is at depth d and has m paths going to it. -% bc : The centrality scores. An entry (y)=m indicates that -% vertex y has a betweenness centrality score of m. - -% Convert the adjacency matrix to an unweighted graph, filter the edges -A = logical(G)'; - -% Get the number of vertices of the graph. 
-N = length(A); - -% Initialize the centrality -bc = zeros(1,N); - -% Fix any issues with the approximation and get the number of passes -if (2^K4approx > N) % Cannot perform more than N approximations - K4approx = floor(log2(N)); -end -nPasses = 2^K4approx; - -% Get the total number of batches -numBatches = ceil(nPasses/batchSize); - -for p = 1:numBatches - % Zero out the BFS - bfs = []; - - % Get the vertices in the current batch - batch = ((p-1).*batchSize + 1):min(p.*batchSize,N); - - % Get the size of the current batch - curSize = length(batch); - - % Set the number of paths to all root vertices to one - nsp = accumarray([(1:curSize)',batch'],1,[curSize,N]); - - % Set the counter for the depth in the BFS - depth = 0; - - % Set the initial fringe to be the neighbors of the root vertices - fringe = double(A(batch,:)); - - % While there are vertices in the fringe to iterate over - while nnz(fringe) > 0 - % Increment the depth - depth = depth + 1; - % Add in the shortest path counts from the fringe - nsp = nsp + fringe; - % Add in the vertices discovered from the fringe to the BFS - bfs(depth).G = logical(fringe); - % Compute the the next fringe - fringe = (fringe * A) .* not(nsp); - end - - % Free up memory - clear('fringe'); - - % Pre-compute 1/nsp - [rows cols vals] = find(nsp); - if(curSize==1) rows = rows'; cols = cols'; end - nspInv = accumarray([rows,cols],1./vals,[curSize,N]); - - % Free up memory - clear('rows','cols','vals'); - - % Pre-compute (1+bcUpdate) - bcu = ones(curSize,N); - - % Compute the bc update for all vertices except the sources - for depth = depth:-1:2 - % Compute the weights to be applied based on the child values - w = (bfs(depth).G .* nspInv) .* bcu; - % Apply the child value weights and sum them up over the parents - % Then apply the weights based on parent values - bcu = bcu + ((A * w')' .* bfs(depth-1).G) .* nsp; - end - - % Update the bc with the bc update - bc = bc + sum(bcu,1); - - % Free up memory - clear('w','nspInv','nsp','bcu','bfs'); -end - -% Subtract off the additional values added in by precomputation -bc = bc - nPasses; - - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Copyright 2007, Massachusetts Institute of Technology -% All rights reserved. -% -% Redistribution and use in source and binary forms, with or without -% modification, are permitted provided that the following conditions are -% met: -% * Redistributions of source code must retain the above copyright -% notice, this list of conditions and the following disclaimer. -% * Redistributions in binary form must reproduce the above copyright -% notice, this list of conditions and the following disclaimer in the -% documentation and/or other materials provided with the distribution. -% * Neither the name of the Massachusetts Institute of Technology nor -% the names of its contributors may be used to endorse or promote -% products derived from this software without specific prior written -% permission. -% -% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -% PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function bc = betwCentrality( G, K4approx, batchSize ) + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Function betwCentrality() - Kernel 4, analyze a graph's connectivity. +% +% The fourth computational kernel computes the betweenness centrality for +% an unweighted graph, using only matrix operations. Betweenness centrality +% is a measure of the importance of a vertex with respect to the shortest +% paths between other vertices in the graph that it lies on. This function +% computes an ordered list of centralities, each centrality corresponding +% to a specific vertex in the graph. +% +% The high computational cost of kernel 4: +% An exact implementation would consider all of the vertices as starting +% points in the betweenness centrality metric; this implementation +% can be 'dialed' to use a subset of starting vertices to obtain an +% approximation of the betweenness centrality. +% +% For a detailed description of the SCCA #2 graph analysis algorithm, +% please see SCCA #2 Graph Analysis Written Specification, V2.2. +% +% NOTES: +% +% This code is the vectorized version of the pseudo-code provided in the +% specification. It is designed to process a full level in the search tree +% at a time rather than just a single vertex. All of the operations are +% performed in he same way, only this code is able to perform them in +% parallel using sparse matrices and matrix operations. In addition, +% rather than processing a single vertex at a time, it has a configurable +% batch size parameter. While increasing the size of a batch increases the +% space required by the algorithm, it may also increase the performance. +% +% This uses Ulrik Brandes' Algorithm from "A faster algorithm for +% betweenness centrality", where variables are named in the following way: +% +% Ulrik Brandes This Code +% ----------------------------------------------------------------------- +% C_B bc +% P, d (unused) +% S, Q bfs +% s batch +% sigma nsp +% delta bcu +% +% S and Q can be stored using the same variable. This optimization can be +% performed in the original algorithm as well by simply using a vector for +% storage rather than a stack and a queue. Instead of discarding vertices +% from the top of Q, the vector pointer is advanced. The stack S +% corresponds to the vertices of the array in reverse order. +% +% bfs is stored as a matrix rather than a vector. Rather than looking at a +% single vertex at a time, all vertices at a particular depth are examined. +% +% D is not required. It was used previously to determine the +% distance between two vertices. In this implementation, this can be +% computed by looking at bfs. In addition, since all the nodes at a +% particular depth in the search are examined at the same time, all +% previously unseen vertices must be on shortest paths. 
+% +% P is computed rather than stored by selecting edges that go between +% vertices at neighboring depths. +% +% References: +% +% D.A. Bader and K. Madduri, "Parallel Algorithms for Evaluating Centrality +% Indices in Real-world Networks", Proc. The 35th International Conference +% on Parallel Processing (ICPP), Columbus, OH, August 2006. +% +% Ulrik Brandes, "A faster algorithm for betweenness centrality". Journal +% of Mathematical Sociology, 25(2):163177, 2001. +% +% L.C. Freeman, "A set of measures of centrality based on betweenness". +% Sociometry, 40(1):3541, 1977. +% +% +% INPUT +% +% G. - [struct] graph (from kernel 1). +% adjMatrix - sparse weighted adjacency matrix of the graph. +% K4approx - [int] binary exponent of the number of times that the +% algorithm is to loop, between 1 and SCALE. This +% simplification reduces its computational time from O(MN) +% to O(M*2^K4approx), which is important when testing large +% graphs. It determines the amount of work performed by +% kernel 4. When 'K4approx' equals 'SCALE', this +% implementation is exact. Otherwise, distinct vertices +% are selected randomly (user). +% batchSize - [int] the number of vertices to process at once. The space +% required by the algorithm increases linearly in this +% parameter. While there is no theoretical decrease in +% runtime by increasing this parameter, in actual +% implementations performance may increase due to batch +% processing of the operations. +% +% OUTPUT +% +% bc - [1D array, float] Betweenness centrality is a measure of +% the importance of a vertex with respect to the shortest +% paths between other vertices in the graph it lies on. bc is +% a list of the centralities that were computed (ordered by +% vertex number). +% +% +% REVISION +% 12-Oct-07 1.0 Release MIT Lincoln Laboratory. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% Allocate the data structures and initialize variables: +% Name Dimension Entries +% A : B^(N x N) M +% bfs : B^(batchSize x N x N) batchSize x N +% nsp : Z+^(batchSize x N) batchSize x N +% bcu : R^(batchSize x N) batchSize x N +% nspInv : R^(batchSize x N) batchSize x N +% w : R^(batchSize x N) batchSize x N +% fringe : Z+^(batchSize x N) < batchSize x N +% bc : R^(N) N +% batch : Z+^(batchSize) batchSize + +% Variable Description: +% A : The adjacency matrix. An entry at (x,y) indicates an edge +% coming from vertex x going to vertex y. Used only in its +% boolean form in this computation (unweighted). +% N : The number of vertices in the graph. +% batchSize : The number of vertices to process simultaneously. +% Increasing this number increases the amount of storage +% required by the algorithm, but may also increase the +% performance due to batch processing of the data. +% batch : The vertices in the current batch to be processed. +% bfs : The breadth-first search tree discovered. An entry at +% (x,y,z) indicates that for the root vertex batch(x), vertex +% y was discovered at depth z in the breadth-first search. +% nsp : The number of shortest paths. An entry (x,y)=m indicates +% that for root vertex batch(x), vertex y has m shortest +% paths to it. +% bcu : The centrality updates. An entry (x,y)=m indicates that +% root vertex batch(x) contributes m to the betweenness +% centrality for vertex y. +% nspInv : The inverse of the number of shortest paths. An entry +% (x,y)=m indicates that for root vertex batch(x), vertex y +% has 1/m shortest paths to it. +% w : The child weights during the centrality update. 
An entry +% (x,y)=m indicates that for root vertex batch(x), child +% vertex y applies a weight of m to all its parent vertices +% during the centrality update. +% fringe : The current open queue of the breadth-first search. When +% the depth is d in the breadth-first search, an entry +% (x,y)=m indicates that for root vertex batch(x), vertex y +% is at depth d and has m paths going to it. +% bc : The centrality scores. An entry (y)=m indicates that +% vertex y has a betweenness centrality score of m. + +% Convert the adjacency matrix to an unweighted graph, filter the edges +A = logical(G)'; + +% Get the number of vertices of the graph. +N = length(A); + +% Initialize the centrality +bc = zeros(1,N); + +% Fix any issues with the approximation and get the number of passes +if (2^K4approx > N) % Cannot perform more than N approximations + K4approx = floor(log2(N)); +end +nPasses = 2^K4approx; + +% Get the total number of batches +numBatches = ceil(nPasses/batchSize); + +for p = 1:numBatches + % Zero out the BFS + bfs = []; + + % Get the vertices in the current batch + batch = ((p-1).*batchSize + 1):min(p.*batchSize,N); + + % Get the size of the current batch + curSize = length(batch); + + % Set the number of paths to all root vertices to one + nsp = accumarray([(1:curSize)',batch'],1,[curSize,N]); + + % Set the counter for the depth in the BFS + depth = 0; + + % Set the initial fringe to be the neighbors of the root vertices + fringe = double(A(batch,:)); + + % While there are vertices in the fringe to iterate over + while nnz(fringe) > 0 + % Increment the depth + depth = depth + 1; + % Add in the shortest path counts from the fringe + nsp = nsp + fringe; + % Add in the vertices discovered from the fringe to the BFS + bfs(depth).G = logical(fringe); + % Compute the the next fringe + fringe = (fringe * A) .* not(nsp); + end + + % Free up memory + clear('fringe'); + + % Pre-compute 1/nsp + [rows cols vals] = find(nsp); + if(curSize==1) rows = rows'; cols = cols'; end + nspInv = accumarray([rows,cols],1./vals,[curSize,N]); + + % Free up memory + clear('rows','cols','vals'); + + % Pre-compute (1+bcUpdate) + bcu = ones(curSize,N); + + % Compute the bc update for all vertices except the sources + for depth = depth:-1:2 + % Compute the weights to be applied based on the child values + w = (bfs(depth).G .* nspInv) .* bcu; + % Apply the child value weights and sum them up over the parents + % Then apply the weights based on parent values + bcu = bcu + ((A * w')' .* bfs(depth-1).G) .* nsp; + end + + % Update the bc with the bc update + bc = bc + sum(bcu,1); + + % Free up memory + clear('w','nspInv','nsp','bcu','bfs'); +end + +% Subtract off the additional values added in by precomputation +bc = bc - nPasses; + + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2007, Massachusetts Institute of Technology +% All rights reserved. +% +% Redistribution and use in source and binary forms, with or without +% modification, are permitted provided that the following conditions are +% met: +% * Redistributions of source code must retain the above copyright +% notice, this list of conditions and the following disclaimer. +% * Redistributions in binary form must reproduce the above copyright +% notice, this list of conditions and the following disclaimer in the +% documentation and/or other materials provided with the distribution. 
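For reference, the update loop above is Brandes' dependency accumulation with bcu holding (1 + delta), nsp holding sigma and nspInv holding 1/sigma, matching the variable table in the header comment:

\delta_s(v) = \sum_{w:\, v \in P_s(w)} \frac{\sigma_{sv}}{\sigma_{sw}} \bigl(1 + \delta_s(w)\bigr), \qquad bc(v) = \sum_{s \neq v} \delta_s(v).

Because bcu is initialised to ones, each of the 2^K4approx source vertices contributes an extra 1 to every centrality, which is exactly what the final bc = bc - nPasses line removes.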
+% * Neither the name of the Massachusetts Institute of Technology nor +% the names of its contributors may be used to endorse or promote +% products derived from this software without specific prior written +% permission. +% +% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +% IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +% THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +% PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +% CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +% EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +% PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +% PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +% LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +% NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +% SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/mfiles/compress.pl b/mfiles/compress.pl index 3d2f3bd1..8d24af1b 100755 --- a/mfiles/compress.pl +++ b/mfiles/compress.pl @@ -1,28 +1,28 @@ -#!/bin/perl -# -# Script to compress all inputs -# -$matrixname="betwinput_scale"; -$transname="betwinput_transposed_scale"; -$outname="bc_scale"; -for($scale=27; $scale<=27; $scale++) -{ - $strtar = "tar -cvf ${outname}$scale.tar ${matrixname}$scale ${transname}$scale\n"; - print $strtar; - system($strtar); - - $strzip = "pbzip2 -p8 -k -r ${outname}$scale.tar\n"; - print $strzip; - system($strzip); - - $strsizetar = "ls -alh ${outname}$scale.tar\n"; - $strsizezip = "ls -alh ${outname}$scale.tar.bz2\n"; - print $strsizetar; - print $strsizezip; - system($strsizetar); - system($strsizezip); - - $strdel = "rm ${matrixname}$scale ${transname}$scale ${outname}$scale.tar\n"; - print $strdel; - system($strdel); -} +#!/bin/perl +# +# Script to compress all inputs +# +$matrixname="betwinput_scale"; +$transname="betwinput_transposed_scale"; +$outname="bc_scale"; +for($scale=27; $scale<=27; $scale++) +{ + $strtar = "tar -cvf ${outname}$scale.tar ${matrixname}$scale ${transname}$scale\n"; + print $strtar; + system($strtar); + + $strzip = "pbzip2 -p8 -k -r ${outname}$scale.tar\n"; + print $strzip; + system($strzip); + + $strsizetar = "ls -alh ${outname}$scale.tar\n"; + $strsizezip = "ls -alh ${outname}$scale.tar.bz2\n"; + print $strsizetar; + print $strsizezip; + system($strsizetar); + system($strsizezip); + + $strdel = "rm ${matrixname}$scale ${transname}$scale ${outname}$scale.tar\n"; + print $strdel; + system($strdel); +} diff --git a/ms_inttypes/inttypes.h b/ms_inttypes/inttypes.h index 25542771..4b3828a2 100644 --- a/ms_inttypes/inttypes.h +++ b/ms_inttypes/inttypes.h @@ -1,305 +1,305 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include "stdint.h" - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "I32d" -#define PRIi32 "I32i" -#define PRIdLEAST32 "I32d" -#define PRIiLEAST32 "I32i" -#define PRIdFAST32 "I32d" -#define PRIiFAST32 "I32i" - -#define PRId64 "I64d" -#define PRIi64 "I64i" -#define PRIdLEAST64 "I64d" -#define PRIiLEAST64 "I64i" -#define PRIdFAST64 "I64d" -#define PRIiFAST64 "I64i" - -#define PRIdMAX "I64d" -#define PRIiMAX "I64i" - -#define PRIdPTR "Id" -#define PRIiPTR "Ii" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "I32o" -#define PRIu32 "I32u" -#define PRIx32 "I32x" -#define PRIX32 "I32X" -#define PRIoLEAST32 "I32o" -#define PRIuLEAST32 "I32u" -#define PRIxLEAST32 "I32x" -#define PRIXLEAST32 "I32X" -#define PRIoFAST32 "I32o" -#define PRIuFAST32 "I32u" -#define PRIxFAST32 "I32x" -#define PRIXFAST32 "I32X" - -#define PRIo64 "I64o" -#define PRIu64 "I64u" -#define PRIx64 "I64x" -#define PRIX64 "I64X" -#define PRIoLEAST64 "I64o" -#define PRIuLEAST64 "I64u" -#define PRIxLEAST64 "I64x" -#define PRIXLEAST64 "I64X" -#define PRIoFAST64 "I64o" -#define PRIuFAST64 "I64u" -#define PRIxFAST64 "I64x" -#define PRIXFAST64 "I64X" - -#define PRIoMAX "I64o" -#define PRIuMAX "I64u" -#define PRIxMAX "I64x" -#define PRIXMAX "I64X" - 
-#define PRIoPTR "Io" -#define PRIuPTR "Iu" -#define PRIxPTR "Ix" -#define PRIXPTR "IX" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" -#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -#endif // __STDC_FORMAT_MACROS ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else // STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] +// ISO C9x compliant inttypes.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) 
WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include "stdint.h" + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "I32d" +#define PRIi32 "I32i" +#define PRIdLEAST32 "I32d" +#define PRIiLEAST32 "I32i" +#define PRIdFAST32 "I32d" +#define PRIiFAST32 "I32i" + +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIdLEAST64 "I64d" +#define PRIiLEAST64 "I64i" +#define PRIdFAST64 "I64d" +#define PRIiFAST64 "I64i" + +#define PRIdMAX "I64d" +#define PRIiMAX "I64i" + +#define PRIdPTR "Id" +#define PRIiPTR "Ii" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "I32o" +#define PRIu32 "I32u" +#define PRIx32 "I32x" +#define PRIX32 "I32X" +#define PRIoLEAST32 "I32o" +#define PRIuLEAST32 "I32u" +#define 
PRIxLEAST32 "I32x" +#define PRIXLEAST32 "I32X" +#define PRIoFAST32 "I32o" +#define PRIuFAST32 "I32u" +#define PRIxFAST32 "I32x" +#define PRIXFAST32 "I32X" + +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#define PRIoLEAST64 "I64o" +#define PRIuLEAST64 "I64u" +#define PRIxLEAST64 "I64x" +#define PRIXLEAST64 "I64X" +#define PRIoFAST64 "I64o" +#define PRIuFAST64 "I64u" +#define PRIxFAST64 "I64x" +#define PRIXFAST64 "I64X" + +#define PRIoMAX "I64o" +#define PRIuMAX "I64u" +#define PRIxMAX "I64x" +#define PRIXMAX "I64X" + +#define PRIoPTR "Io" +#define PRIuPTR "Iu" +#define PRIxPTR "Ix" +#define PRIXPTR "IX" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" +#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +#endif // __STDC_FORMAT_MACROS ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = 
numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/ms_inttypes/stdint.h b/ms_inttypes/stdint.h index 59d06730..d02608a5 100644 --- a/ms_inttypes/stdint.h +++ b/ms_inttypes/stdint.h @@ -1,247 +1,247 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2008 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. 
-#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer 
types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -#define INTMAX_C INT64_C -#define UINTMAX_C UINT64_C - -#endif // __STDC_CONSTANT_MACROS ] - - -#endif // _MSC_STDINT_H_ ] +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. +#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest 
minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ]
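
The ms_inttypes/inttypes.h and ms_inttypes/stdint.h hunks above appear to be whitespace/line-ending-only rewrites of the vendored C99-compatibility headers for pre-C99 Visual C++; together they supply the fixed-width typedefs, the PRI/SCN format-specifier macros, the integer-constant macros, and imaxdiv that older MSVC runtimes lacked. As a rough, hypothetical sketch (not part of the patch: the file name demo.c and the include paths are assumptions, and it only builds under Visual C++ because both headers #error out when _MSC_VER is not defined), client code would use these headers roughly like this:

/* demo.c -- illustrative only, assumes the repository layout above. */
#include <stdio.h>
#include "ms_inttypes/stdint.h"   /* fixed-width typedefs, INTn_C, limits   */
#include "ms_inttypes/inttypes.h" /* PRI/SCN macros, imaxdiv, strtoimax     */

int main(void)
{
    int64_t  big   = INT64_C(9000000000);  /* integer-constant macro -> 9000000000i64 */
    uint32_t small = UINT32_MAX;

    /* PRId64 / PRIu32 expand to MSVC's "I64d" / "I32u" length modifiers. */
    printf("big = %" PRId64 ", small = %" PRIu32 "\n", big, small);

    /* imaxdiv mirrors div() semantics for the greatest-width type; with the
       truncating division used here this should print quot = -3, rem = -1. */
    imaxdiv_t qr = imaxdiv((intmax_t)-7, (intmax_t)2);
    printf("quot = %" PRIdMAX ", rem = %" PRIdMAX "\n", qr.quot, qr.rem);

    return 0;
}

On a C99-conforming toolchain the same program would instead include the system <stdint.h> and <inttypes.h>; the vendored copies exist purely as a drop-in for Visual C++ versions that ship without them.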