diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh index 7f276a2a..429966f6 100644 --- a/lib/THC/THCReduce.cuh +++ b/lib/THC/THCReduce.cuh @@ -198,7 +198,7 @@ bool THC_reduceDim(THCState* state, } block = getContigReduceBlock(outElements, reductionSize); - smemSize = sizeof(typename TensorUtils::DataType) * block.x; + smemSize = reduceSmemSize::DataType, 1>(state, block.x); } else { if (!getNoncontigReduceGrid(outElements, grid)) { return false; diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh index 1d04e63b..73802e60 100644 --- a/lib/THC/THCReduceAll.cuh +++ b/lib/THC/THCReduceAll.cuh @@ -162,14 +162,14 @@ inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements, dim3& grid, dim3& block) { grid = dim3(1); // We only need as many threads as there were blocks originally - block = dim3(getTwoPassBlocks(state, elements)); + block = dim3((int) THCRoundUp(getTwoPassBlocks(state, elements), (ptrdiff_t) 32)); } template inline void getSinglePassReduceBlockGrid(ptrdiff_t elements, dim3& grid, dim3& block) { grid = dim3(1); - block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); + block = dim3(THCRoundUp((int) std::min(elements, (ptrdiff_t) THC_REDUCE_ALL_BLOCK_SIZE), 32)); } template (state, totalElements, grid, block); - size_t smemSize = block.x * sizeof(AccT); + size_t smemSize = reduceSmemSize(state, block.x); kernelReduceAllPass1 <<>>( @@ -209,7 +209,7 @@ void callReduceAll(THCState* state, int numPass1Blocks = grid.x; getPass2ReduceBlockGrid(state, totalElements, grid, block); - smemSize = block.x * sizeof(AccT); + smemSize = reduceSmemSize(state, block.x); kernelReduceAllPass2 <<>>( @@ -221,7 +221,7 @@ void callReduceAll(THCState* state, } } else { getSinglePassReduceBlockGrid(totalElements, grid, block); - size_t smemSize = block.x * sizeof(AccT); + size_t smemSize = reduceSmemSize(state, block.x); kernelReduceAll <<>>( diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh index 30325de5..632e12c2 100644 --- a/lib/THC/THCReduceApplyUtils.cuh +++ b/lib/THC/THCReduceApplyUtils.cuh @@ -1,12 +1,14 @@ #ifndef THC_REDUCE_APPLY_UTILS_INC #define THC_REDUCE_APPLY_UTILS_INC +#include #include #include #include "THCGeneral.h" #include "THCTensor.h" #include "THCDeviceUtils.cuh" #include "THCTensorInfo.cuh" +#include "THCAsmUtils.cuh" // Enum that indicates whether tensor arguments are read/write or // read-only @@ -19,6 +21,108 @@ __device__ __forceinline__ IndexType getLinearBlockId() { blockIdx.x; } +// Returns the minimum size of shared memory required of performing N reductions +// across a block, with each reduction having at most numVals individual elements +// to reduce. Note that internally, because reductions operate (at the shared memory +// level) with N elements per thread in the block, so we have to use min(numvals, +// max block size) to determine this count. +template +int reduceSmemSize(THCState *state, long numVals) { + // check if we can use a warp shuffle + cudaDeviceProp *props = THCState_getCurrentDeviceProperties(state); + if (props->major >= 3) { + return props->warpSize * N * sizeof(T); + } else { + return THCRoundUp(std::min(numVals, (long) props->maxThreadsPerBlock), (long) props->warpSize) * N * sizeof(T); + } +} + +template +struct THCWarpUtils { + static __device__ __forceinline__ T shflxor(T val, unsigned int mask) { + return __shfl_xor(val, mask); + } +}; + +template <> +struct THCWarpUtils { + static __device__ __forceinline__ unsigned char shflxor(unsigned char val, unsigned int mask) { + return (unsigned char) __shfl_xor((int) val, mask); + } +}; + +template <> +struct THCWarpUtils { + static __device__ __forceinline__ char shflxor(char val, unsigned int mask) { + return (char) __shfl_xor((int) val, mask); + } +}; + +template <> +struct THCWarpUtils { + static __device__ __forceinline__ short shflxor(short val, unsigned int mask) { + return (short) __shfl_xor((int) val, mask); + } +}; + +template <> +struct THCWarpUtils { + static __device__ __forceinline__ double shflxor(double val, unsigned int mask) { + int2 a = *reinterpret_cast(&val); + a.x = __shfl_xor(a.x, mask); + a.y = __shfl_xor(a.y, mask); + return *reinterpret_cast(&a); + } +}; + +template <> +struct THCWarpUtils { + static __device__ __forceinline__ long shflxor(long val, unsigned int mask) { + int2 a = *reinterpret_cast(&val); + a.x = __shfl_xor(a.x, mask); + a.y = __shfl_xor(a.y, mask); + return *reinterpret_cast(&a); + } +}; + +template +__device__ void warpReduce(T threadVals[N], ReduceOp reduceOp) { +#pragma unroll + for (int mask = 1; mask < warpSize; mask *= 2) { +#pragma unroll + for (int i = 0; i < N; ++i) { + T neighbor = THCWarpUtils::shflxor(threadVals[i], mask); + threadVals[i] = reduceOp(threadVals[i], neighbor); + } + } +} + +template +__device__ void warpReduceBlock(T *smem, T threadVals[N], int numVals, ReduceOp reduceOp, T init) { + assert(blockDim.x % warpSize == 0); + // First, warps cooperate to reduce values within the warp + warpReduce(threadVals, reduceOp); + int lane = getLaneId(); + int warp = threadIdx.x / warpSize; + + if (lane == 0) { + +#pragma unroll + for (int i = 0; i < N; ++i) { + smem[warp + (i * warpSize)] = threadVals[i]; + } + } + __syncthreads(); + + if (warp == 0) { +#pragma unroll + for (int i = 0; i < N; ++i) { + threadVals[i] = (threadIdx.x < (blockDim.x / warpSize)) ? smem[lane + (i * warpSize)] : init; + } + warpReduce(threadVals, reduceOp); + } +} + // Reduce N values concurrently, i.e. suppose N = 2, and there are 4 threads: // (1, 2), (3, 4), (5, 6), (7, 8), then the return in threadVals for thread 0 // is (1 + 3 + 5 + 7, 2 + 4 + 6 + 8) = (16, 20) @@ -36,6 +140,9 @@ __device__ void reduceNValuesInBlock(T *smem, return; } +#if __CUDA_ARCH__ >= 300 + warpReduceBlock(smem, threadVals, numVals, reduceOp, init); +#else // We store each of the N values contiguously, so if N = 2, all values for // the first threadVal for each thread in the block are stored followed by // all of the values for the second threadVal for each thread in the block @@ -91,6 +198,7 @@ __device__ void reduceNValuesInBlock(T *smem, } } } +#endif } // Block-wide reduction in shared memory helper; only threadIdx.x == 0 will @@ -105,10 +213,13 @@ __device__ T reduceBlock(T* smem, return threadVal; } - // Block-wide reduction where each thread locally reduces N // values before letting a single warp take over - assumes -// threadVals is in registers, not shared memory +// threadVals is in registers, not shared memory. Note that +// numVals in this case is the number of values in the overall +// reduction, i.e. if there are 512 threads with N=2, and say +// there are 768 elements in the input block, then numVals is 768, +// not, say, 384 (i.e. 768 / N=2) template __device__ T reduceBlockWithNThreadLocalReductions(T *smem, T threadVals[N], @@ -125,7 +236,7 @@ __device__ T reduceBlockWithNThreadLocalReductions(T *smem, local = reduceOp(local, next); } - return reduceBlock(smem, blockDim.x < numVals ? blockDim.x : numVals, local, reduceOp, init); + return reduceBlock(smem, THCCeilDiv(numVals, N), local, reduceOp, init); } // Make sure the given tensor doesn't have too many dimensions diff --git a/lib/THC/THCTensorMode.cuh b/lib/THC/THCTensorMode.cuh index b67ac2a4..e2c30cf1 100644 --- a/lib/THC/THCTensorMode.cuh +++ b/lib/THC/THCTensorMode.cuh @@ -56,6 +56,15 @@ struct ModeUnsignedBoolPair { bool flag; }; +template <> +struct THCWarpUtils { + static __device__ __forceinline__ ModeUnsignedBoolPair shflxor(ModeUnsignedBoolPair val, unsigned int mask) { + val.val = __shfl_xor(val.val, mask); + val.flag = (bool) __shfl_xor((int) val.flag, mask); + return val; + } +}; + // In the kernel below, we have a common pattern of reducing (unsigned int, unsigned int) // pairs of data struct ModeUnsignedPair { @@ -63,6 +72,15 @@ struct ModeUnsignedPair { unsigned int index; }; +template <> +struct THCWarpUtils { + static __device__ __forceinline__ ModeUnsignedPair shflxor(ModeUnsignedPair val, unsigned int mask) { + val.val = __shfl_xor(val.val, mask); + val.index = __shfl_xor(val.index, mask); + return val; + } +}; + template struct MaxReduceOp { __host__ __device__ inline T operator()(const T& a, const T& b) { @@ -77,6 +95,15 @@ struct MatchReduceOp { } }; +template +int modeSmemSize(THCState *state) { + int sliceElementSize = sizeof(T) * Power2Size; + int sortScanSize = 2 * Power2Size * sizeof(unsigned int); + int reductionSize = reduceSmemSize(state, Power2Size); + + return sliceElementSize + (sortScanSize > reductionSize ? sortScanSize : reductionSize); +} + // The mode kernel has the following characteristics: It uses internal shared memory // buffers of Power2Size, which must be greater than the number of elements. Additionally, // there is one block for every slice to calculate the mode for, and in each block there @@ -271,6 +298,7 @@ __global__ void computeMode( // Finally, we have the mode, and an index where it occurs. We use a single thread // to place this in the appropriate output position if (tidx == 0) { + assert(match.flag); long index = TH_INDEX_BASE + match.val; unsigned int outputOffset = IndexToOffset::get(blockId, values); diff --git a/lib/THC/THCTensorTopK.cuh b/lib/THC/THCTensorTopK.cuh index 32041e3b..d1c848e8 100644 --- a/lib/THC/THCTensorTopK.cuh +++ b/lib/THC/THCTensorTopK.cuh @@ -166,18 +166,19 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize], #pragma unroll for (unsigned int j = 0; j < RadixSize; ++j) { bool vote = hasVal && (digitInRadix == j); - counts[j] += __popc(__ballot(vote)); + counts[j] += vote; } } - // Now, for each warp, sum values - if (getLaneId() == 0) { + reduceNValuesInBlock, RadixSize>( + smem, counts, blockDim.x, AddOp(), 0); + + if (threadIdx.x == 0) { #pragma unroll for (unsigned int i = 0; i < RadixSize; ++i) { - atomicAdd(&smem[i], counts[i]); + smem[i] = counts[i]; } } - __syncthreads(); // For each thread, read in the total counts @@ -194,6 +195,10 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize], #define RADIX_SIZE 4 // 2 ^ RADIX_BITS #define RADIX_MASK (RADIX_SIZE - 1) +int topkSmemSize(THCState *state, long sliceSize) { + return reduceSmemSize(state, sliceSize); +} + // This finds the unique value `v` that matches the pattern // ((v & desired) == desiredMask) in our sorted int format template @@ -356,7 +361,7 @@ __global__ void gatherTopK(TensorInfo input, IndexType indicesWithinSliceStride) { // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of IndexType - __shared__ int smem[32]; // one per each warp, up to warp limit + extern __shared__ int smem[]; // one per each warp, up to warp limit IndexType slice = getLinearBlockId(); if (slice >= numInputSlices) { diff --git a/lib/THC/generic/THCTensorMode.cu b/lib/THC/generic/THCTensorMode.cu index 031e0acc..e8428e87 100644 --- a/lib/THC/generic/THCTensorMode.cu +++ b/lib/THC/generic/THCTensorMode.cu @@ -198,7 +198,7 @@ THC_API void THCTensor_(mode)(THCState *state, // 2. uses one block per slice, so number of slices must be less than the maximum number of blocks for // a kernel launch // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed) - if (sliceSize <= MAX_BLOCK_SIZE && + if (sliceSize <= MAX_BLOCK_SIZE * 2 && slices <= MAX_GRID_SIZE && TensorUtils::canUse32BitIndexMath(state, input)) { // Beginning our optimized implementation. First thing we want to do is to transpose @@ -230,7 +230,7 @@ THC_API void THCTensor_(mode)(THCState *state, { \ dim3 blockSize(SIZE / 2); \ \ - int memsize = (sizeof(real) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \ + int memsize = modeSmemSize(state); \ computeMode \ <<>>( \ THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \ @@ -248,15 +248,15 @@ THC_API void THCTensor_(mode)(THCState *state, HANDLE_MODE(1024) break; case 128: - case 64: HANDLE_MODE(128) break; + case 64: case 32: case 16: case 8: case 4: case 2: - HANDLE_MODE(32) + HANDLE_MODE(64) // block size should be at least 1 full warp break; case 1: default: diff --git a/lib/THC/generic/THCTensorRandom.cu b/lib/THC/generic/THCTensorRandom.cu index 4c6d2fb2..759ee095 100644 --- a/lib/THC/generic/THCTensorRandom.cu +++ b/lib/THC/generic/THCTensorRandom.cu @@ -93,10 +93,12 @@ void THCTensor_(renormRows)(struct THCState* state, int maxThreads = props->maxThreadsPerBlock; dim3 grid(rows < numSM * 4 ? rows : numSM * 4); - dim3 block(cols < maxThreads ? cols : maxThreads); + dim3 block(THCRoundUp(cols < maxThreads ? cols : maxThreads, (long) props->warpSize)); + + int smem = reduceSmemSize(state, cols); renormRowsL1 - <<>>(THCTensor_(data)(state, t), rows, cols); } @@ -157,11 +159,13 @@ THC_API void THCTensor_(multinomial)(struct THCState *state, THAssert(props != NULL); int numSM = props->multiProcessorCount; int maxThreads = props->maxThreadsPerBlock; - dim3 block(numCategories < maxThreads ? numCategories : maxThreads); + dim3 block(THCRoundUp(numCategories < maxThreads ? numCategories : maxThreads, props->warpSize)); dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4); + // smem required for reduction + prefix sums + int smemSize = (block.x * sizeof(real)) + reduceSmemSize(state, numCategories); sampleMultinomialOnce <<>>( THCudaLongTensor_data(state, self), numDist, diff --git a/lib/THC/generic/THCTensorTopK.cu b/lib/THC/generic/THCTensorTopK.cu index 83ab1e10..ab91cb4a 100644 --- a/lib/THC/generic/THCTensorTopK.cu +++ b/lib/THC/generic/THCTensorTopK.cu @@ -28,9 +28,11 @@ THC_API void THCTensor_(topk)(THCState* state, THCudaLongTensor_resize(state, indices, topKSize, NULL); THLongStorage_free(topKSize); + int smem = topkSmemSize(state, sliceSize); + #define RUN_K(INDEX_T, DIM, DIR) \ gatherTopK \ - <<>>( \ + <<>>( \ inputInfo, \ sliceSize, \ k, \