From a4c28344b9b1afa2d2ba7670ce5c974fb007929c Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Sat, 20 Jul 2024 17:11:11 -0400 Subject: [PATCH 01/44] HOROVOD new version compatible --- .DS_Store | Bin 0 -> 8196 bytes examples/.DS_Store | Bin 0 -> 6148 bytes .../tensorflow/tensorflow2_keras_mnist.py | 15 +- examples/tensorflow/tensorflow2_mnist.py | 12 +- examples/tensorflow/tensorflow_keras_mnist.py | 34 +- .../__pycache__/pytorch_mnist.cpython-310.pyc | Bin 0 -> 6384 bytes examples/torch/hvd.py | 289 +++++++ examples/torch/hvd1.py | 92 +++ examples/torch/pytorch_mnist.py | 400 ++++++---- grace_dl/.DS_Store | Bin 0 -> 6148 bytes grace_dl/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 185 bytes grace_dl/tensorflow/.DS_Store | Bin 0 -> 8196 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 4998 bytes .../__pycache__/allgather.cpython-310.pyc | Bin 0 -> 1622 bytes grace_dl/tensorflow/communicator/allgather.py | 2 +- .../__pycache__/topk.cpython-310.pyc | Bin 0 -> 1969 bytes grace_dl/tensorflow/compressor/topk.py | 2 +- .../__pycache__/residual.cpython-310.pyc | Bin 0 -> 1295 bytes grace_dl/tensorflow/memory/residual.py | 6 +- grace_dl/torch/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 2858 bytes .../torch/__pycache__/helper.cpython-310.pyc | Bin 0 -> 3310 bytes .../__pycache__/allgather.cpython-310.pyc | Bin 0 -> 1961 bytes .../__pycache__/allreduce.cpython-310.pyc | Bin 0 -> 1111 bytes .../__pycache__/broadcast.cpython-310.pyc | Bin 0 -> 1549 bytes grace_dl/torch/communicator/allgather.py | 2 +- grace_dl/torch/communicator/allreduce.py | 2 +- .../__pycache__/fp16.cpython-310.pyc | Bin 0 -> 974 bytes .../__pycache__/randomk.cpython-310.pyc | Bin 0 -> 1897 bytes .../__pycache__/topk.cpython-310.pyc | Bin 0 -> 1654 bytes grace_dl/torch/compressor/fp16.py | 6 +- grace_dl/torch/compressor/randomk.py | 2 +- grace_dl/torch/compressor/topk.py | 2 +- grace_dl/torch/helper.py | 4 + .../memory/__pycache__/none.cpython-310.pyc | 
Bin 0 -> 749 bytes .../__pycache__/residual.cpython-310.pyc | Bin 0 -> 1093 bytes grace_dl/torch/memory/residual.py | 2 +- patch_files/.DS_Store | Bin 0 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 0 -> 6148 bytes patch_files/horovod/torch/.DS_Store | Bin 0 -> 6148 bytes patch_files/horovod/torch/__init__.py | 57 +- patch_files/horovod/torch/mpi_ops.py | 715 +++++++++++++++--- patch_files/horovod/torch/optimizer.py | 173 ++++- 43 files changed, 1462 insertions(+), 355 deletions(-) create mode 100644 .DS_Store create mode 100644 examples/.DS_Store create mode 100644 examples/torch/__pycache__/pytorch_mnist.cpython-310.pyc create mode 100644 examples/torch/hvd.py create mode 100644 examples/torch/hvd1.py create mode 100644 grace_dl/.DS_Store create mode 100644 grace_dl/__pycache__/__init__.cpython-310.pyc create mode 100644 grace_dl/tensorflow/.DS_Store create mode 100644 grace_dl/tensorflow/__pycache__/__init__.cpython-310.pyc create mode 100644 grace_dl/tensorflow/communicator/__pycache__/allgather.cpython-310.pyc create mode 100644 grace_dl/tensorflow/compressor/__pycache__/topk.cpython-310.pyc create mode 100644 grace_dl/tensorflow/memory/__pycache__/residual.cpython-310.pyc create mode 100644 grace_dl/torch/.DS_Store create mode 100644 grace_dl/torch/__pycache__/__init__.cpython-310.pyc create mode 100644 grace_dl/torch/__pycache__/helper.cpython-310.pyc create mode 100644 grace_dl/torch/communicator/__pycache__/allgather.cpython-310.pyc create mode 100644 grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc create mode 100644 grace_dl/torch/communicator/__pycache__/broadcast.cpython-310.pyc create mode 100644 grace_dl/torch/compressor/__pycache__/fp16.cpython-310.pyc create mode 100644 grace_dl/torch/compressor/__pycache__/randomk.cpython-310.pyc create mode 100644 grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc create mode 100644 grace_dl/torch/memory/__pycache__/none.cpython-310.pyc create mode 100644 
grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc create mode 100644 patch_files/.DS_Store create mode 100644 patch_files/horovod/.DS_Store create mode 100644 patch_files/horovod/torch/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8e2c11e11d140309e12d25b0f48ef8971855e5d3 GIT binary patch literal 8196 zcmeHM&x_MQ6n@jK?v$bwTy^&r@ZhCj>$WW7CD!^67}0}DO`6&U(@bbuE0jX+diQVf zE(_i~c~?*RH+b;Pk9C_S^#_8w_y%UaWahnO-uES$z9|5(&dhHD>;eGCDzR3@rl81i zS#zaidd?v-^aqF`gaIVpJc)DG)Ikgo1H=F^KnxHA{{jQJX3N4itowSDT4I10_%9id z-vFyqQhiovuyj$6_k(KeWB z)b7BvJ1}!+W-=6}?v8$zG6!aBq?Q;U2J#HZ+?`|2{tR3gujcPwC_Ef{<;7u;OdE}N zth7}=b$Xj`^D2MnjKtiTy3<+GaVPiX*&QK5@95a|?goQ-ueNhjL{m421{38F_yaU~ za4(2_G4F_3-|jsL z$B}rhV`1id0B6=>dr=gapY%fyCn{}oQHR5((-&jCbHP5&ifqQB+cnJa_VO^fxOaG? z*v|hcyLhy6&ytOo%FpYmGl@bG4RK5c%V7u;RI87w^}vTv?za40wVG0`_V25;7FbrT zwZV*9)=RgDnN_u3zmMNsI8tlvT4Qmn*7xi(YF$^j>;k&YKEP6dZ7Cw&I$KU+# zSNA?YsOp2?>L;Vp-oQShD{MgqAtX4nfGNC!7%}(rbfV|^6zibcuXHl5*A91*}AegI%_$W6D%ZTS16P&Sn1 `lr = 1.0 * hvd.size()` during # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1), + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1,initial_lr=0.01), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. @@ -93,4 +94,4 @@ # Train the model. # Horovod: adjust number of steps based on number of GPUs. 
-mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose) +mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose) \ No newline at end of file diff --git a/examples/tensorflow/tensorflow2_mnist.py b/examples/tensorflow/tensorflow2_mnist.py index e61bbcc..515705b 100644 --- a/examples/tensorflow/tensorflow2_mnist.py +++ b/examples/tensorflow/tensorflow2_mnist.py @@ -14,11 +14,15 @@ # ============================================================================== import tensorflow as tf +import sys +sys.path.append("/content/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") import horovod.tensorflow as hvd -from grace_dl.tensorflow.communicator.allgather import Allgather -from grace_dl.tensorflow.compressor.topk import TopKCompressor -from grace_dl.tensorflow.memory.residual import ResidualMemory +from grace.grace_dl.tensorflow.communicator.allgather import Allgather +from grace.grace_dl.tensorflow.compressor.topk import TopKCompressor +from grace.grace_dl.tensorflow.memory.residual import ResidualMemory # Horovod: initialize Horovod. hvd.init() @@ -68,7 +72,7 @@ def training_step(images, labels, first_batch): loss_value = loss(labels, probs) # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape, grace=grc) + tape = hvd.DistributedGradientTape(tape, grc) grads = tape.gradient(loss_value, mnist_model.trainable_variables) opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) diff --git a/examples/tensorflow/tensorflow_keras_mnist.py b/examples/tensorflow/tensorflow_keras_mnist.py index 2d6fce1..a558130 100644 --- a/examples/tensorflow/tensorflow_keras_mnist.py +++ b/examples/tensorflow/tensorflow_keras_mnist.py @@ -1,28 +1,44 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== from __future__ import print_function import math - +import sys +sys.path.append("/content/") +print(sys.path) import tensorflow as tf from tensorflow import keras -from tensorflow.keras import backend as K +from tensorflow.compat.v1.keras import backend as K from tensorflow.keras.datasets import mnist from tensorflow.keras.layers import Conv2D, MaxPooling2D from tensorflow.keras.layers import Dense, Dropout, Flatten from tensorflow.keras.models import Sequential import horovod.tensorflow.keras as hvd -from grace_dl.tensorflow.communicator.allgather import Allgather -from grace_dl.tensorflow.compressor.topk import TopKCompressor -from grace_dl.tensorflow.memory.residual import ResidualMemory +from grace.grace_dl.tensorflow.communicator.allgather import Allgather +from grace.grace_dl.tensorflow.compressor.topk import TopKCompressor +from grace.grace_dl.tensorflow.memory.residual import ResidualMemory # Horovod: initialize Horovod. 
hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() +config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) +K.set_session(tf.compat.v1.Session(config=config)) batch_size = 128 num_classes = 10 @@ -76,7 +92,7 @@ grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) # Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt, grace=grc) +opt = hvd.DistributedOptimizer(opt, grc) model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, @@ -101,4 +117,4 @@ validation_data=(x_test, y_test)) score = model.evaluate(x_test, y_test, verbose=0) print('Test loss:', score[0]) -print('Test accuracy:', score[1]) +print('Test accuracy:', score[1]) \ No newline at end of file diff --git a/examples/torch/__pycache__/pytorch_mnist.cpython-310.pyc b/examples/torch/__pycache__/pytorch_mnist.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ead69b23e0fe9b0ccb36728fa5ec5621661c976 GIT binary patch literal 6384 zcmbtYO^h4Kb?&bIVYB%&)C@KHmquP`wcH<0Bsum z`iH7+&aOrz5@3W3b&#E75^MkxFpTWQ2VV>aIRv@nkV6poRyQL7fl;&(8#D0cw0VfO4$(Hzs-M-e_WG;0-Lmt(femd1Zt=p5?YDNKVyigs z=|)AS=~`53mFJvFRBct~^g^o!eNK2JT5K&wORc3jzuu~=agIi{*0SOi!(-9$)^SC< z;fd&E>!hMf;i>3!>$IZF;hE@c>#U+#SZmd8X}rR#k2PKmiv4rjMr(yH@Y-Xobw2nC zKeEBZ-v+NtFU+*o#o%K9)v}iSMNsQsn!YyEf{WfI@EYQ;gV*`uHmy~?|3+{=xY*H& zkLOK1Z}FuKrrz}p#*aR=F*dua1AK9Fzou-dLZT% z5%6I0E2E9j&8t zowLVI%isn#v1jIvW@;@9t7%DNEy&t1@ckdr79LWTevBg1GJT|NvniXw{4%3&k~&iL z)EH@=r8o?RyT`FJ81hVm%mG#8HkmYPuBsI{Q?(M;Fac+6mj>J=SKOsx@tLKKK*T}l zrM<0Sw@!jdT*7nG0na2Ycq)Venivf-@ub@ zqR6zVjt6TzHAeaj+f5KMCf*+3gv=t@fpnx#ZQ92f%UhyFA#XwTzB9tE_lwwN$RTq{ zrdpPWBI8-xrK#*b!%r-L!V)?#Zfr^1#ZKw(OH%~lP`Z)-nKww1@Cp}4(M}wvVi|=r zH+#XS;v|s;+81n;c!i{=h*b)cu9qgA34yp1QdQ8*e#b`5^ zTb^ml8YG3RGOglSm}w|$+~uX2ninrWHOCCPkMtM&Oa1zYq1REgQhj;lsqq9(t)$B% 
z*3mKAv5Cv8pRdhraL$hiAot|wN7h~q@1^i7(JNz$ZFPQ>FY{xc8{@*r9u>CrCk*oA;f3hp*n#F( z`H5|PT-?!i^c}Wi?pQnaPI1TWIQ%3(_1GD^;M*O>PxCX6ohJs`8LCxb$LZMoEI)@f zrR)+n@O`gw;}`(^ruKhc{E=6?xXQ4Uavj6 z+WfW7_+sOe;_W0&A-dZ9jn4RAFc5yi7|1f^2{;V|V$%<$rA`LYrWD)Hq!9+OG<#VPHHy-jkENDDP2JrmFU!)N z+v7kXi#h1@_-E22P?L7%i*Arf8%fGwm`Ri7(r}~+Hs%y;1~Gx8G`gZKEo2r!2=JiS zWm!l#SHiM_KYWJIv2K;THixu>oJK9q=lJOKHj3qfsoT$%SzgxkoO+$P`m#a1*O{Zc ztghS08*2t=-2tbpJ1PTmSe4nvt7ymm$o$u>dIa0)G~Wr6wjZYNHq{KC%P(o$=~007 zO%Mo`$N-W3v5ZX(3V(1329P5AhX>nxq3rOjaQO@*fX{PeI zX+`zY9~^_e>I4XOhIHurv_63u@ZvTbYoK)Qg3_}EUYgdxFOPtTGt3NM$G8?A8QZ|@ zMPA8oGqCy9Lz~~-TfO%w#~{qv+YXg|+yl3;5d;I?i&_qCDLGrKOqAbl4{@x2xb;={ zUXW%h@VToib7)ziW(xSe))~LKa(&)ZajpOwT|o{d=(N>XI%HyIKo<2Kk`dO3?@~d= zCe~3%0|yYvf(Nod=M@jBiP0VmrPYBS!{GsP!l%X_6@NfIOh99R!)l0cXcSt-3^UEY zp`ysWk5KwWkNWjhKSF!44YFa2kk}8YNT?W~kfpgMg&$>A`6c+k=os#w(L8OUpmUG| z^e}-53L{kgIIFV@df6=NuI1`>!2v=5EA*|U7c8H2N&Rx8q1H>f-^vrhyMQ{Aj^ARa z<`wxd4p0EOfM6M=T2AQ_!t-FW*AAqsKu0BIa)J1i?PcpCz%7iRd{bI?*1mD~o^pFZ zxuz`84M>s((z%!1Q@|jLYXs~eOh#J6tQV&F9+mh(WRi=A!qnY7VHDpceeO*_`E7;1 z;wmu~)Dj*fnCj>z?m#5gsQ8cy3Xdu#%KCkQ&PYxXm-oXbJ~Bhx@_4VE$rA@H_Ct%O z<`HIo`_#$}W?mDVlmt{z*7*~DCK6UfHfxW z^9GuO?H`k1hYCSOO2rBa#6=1}g!t)W6jc~pjm*-lfpU#HTcj`t12unPK4Z_^LRFb1 z)7djy2M4vfNpj~GklG}zF*cXjmW{p?{PMt<2g*fr5mzczW$FlvMMpnbsD5?%X#I5x zo2D*iG%;KY(9}F{>%vFFn65BZRA^Mhx=4oI^_BHRB%29uiXk$JZkQwhhC#1k6nA_MWk0GxfhcOUO4kjYVKx-FZV+^F0^E3rbI1ULeONWw z?4`(mlt0dWaQ@x0vbO-LT@D9tyv$imMk%}s0n&hlh#!)+%E76Tl1h_o?9eUJK~mzplix+Xi~K=L33|#JC2{%R sK{5xcW@V&!d17}Uqx#~Cv#9-{^MRuy4MYk9Ur~SR<^FfsG5`I40Ar=-aR2}S literal 0 HcmV?d00001 diff --git a/examples/torch/hvd.py b/examples/torch/hvd.py new file mode 100644 index 0000000..0865b9c --- /dev/null +++ b/examples/torch/hvd.py @@ -0,0 +1,289 @@ +import sys +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") +import argparse +import os +from packaging import version + +import torch.multiprocessing as mp 
+import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from filelock import FileLock +from torchvision import datasets, transforms + +import horovod +import horovod.torch as hvd + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=2, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +parser.add_argument('--use-mixed-precision', action='store_true', default=False, + help='use mixed precision for training') +parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') +parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help='apply gradient predivide factor in optimizer (default: 1.0)') +parser.add_argument('--data-dir', + help='location of the training dataset in the local filesystem (will be downloaded if needed)') + +# Arguments when not run through horovodrun 
+parser.add_argument('--num-proc', type=int) +parser.add_argument('--hosts', help='hosts to run on in notation: hostname:slots[,host2:slots[,...]]') +parser.add_argument('--communication', help='collaborative communication to use: gloo, mpi') + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +def main(args): + def train_mixed_precision(epoch, scaler): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + with torch.cuda.amp.autocast(): + output = model(data) + loss = F.nll_loss(output, target) + + scaler.scale(loss).backward() + # Make sure all async allreduces are done + optimizer.synchronize() + # In-place unscaling of all gradients before weights update + scaler.unscale_(optimizer) + with optimizer.skip_synchronize(): + scaler.step(optimizer) + # Update scaler in case of overflow/underflow + scaler.update() + + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}'.format( + epoch, batch_idx * len(data), len(train_sampler), + 100. * batch_idx / len(train_loader), loss.item(), scaler.get_scale())) + + def train_epoch(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. 
+ train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), + 100. * batch_idx / len(train_loader), loss.item())) + + def metric_average(val, name): + tensor = torch.tensor(val) + if(hvd.rank()%2==0): + ps=hvd.ProcessSet([0,2]) + avg_tensor = hvd.allreduce(tensor, name=name,process_set=even_set) + return avg_tensor.item() + else: + ps=hvd.ProcessSet([1,3]) + avg_tensor = hvd.allreduce(tensor, name=name,process_set=odd_set) + return avg_tensor.item() + + def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + # if hvd.rank() == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + + # Horovod: initialize library. 
+ even_set = hvd.ProcessSet([0,2]) + odd_set = hvd.ProcessSet([1,3]) + hvd.init(process_sets=[even_set, odd_set]) + print("HERE") + torch.manual_seed(args.seed) + + print("args.cuda",args.cuda) + print("torch.cuda.is_available()",torch.cuda.is_available()) + # if args.cuda: + # # Horovod: pin GPU to local rank. + # torch.cuda.set_device(hvd.local_rank()) + # torch.cuda.manual_seed(args.seed) + # else: + # if args.use_mixed_precision: + # raise ValueError("Mixed precision is only supported with cuda enabled.") + + if (args.use_mixed_precision and version.parse(torch.__version__) + < version.parse('1.6.0')): + raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(), + which requires torch >= 1.6.0""") + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(1) + + kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} + # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent + # issues with Infiniband implementations that are not fork-safe + if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and + mp._supports_context and 'forkserver' in mp.get_all_start_methods()): + kwargs['multiprocessing_context'] = 'forkserver' + + data_dir = args.data_dir or './data' + with FileLock(os.path.expanduser("~/.horovod_lock")): + train_dataset = \ + datasets.MNIST(data_dir, train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + + # Horovod: use DistributedSampler to partition the training data. 
+ train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + + test_dataset = \ + datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + # Horovod: use DistributedSampler to partition the test data. + test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + model = Net() + + # By default, Adasum doesn't need scaling up learning rate. + lr_scaler = hvd.size() if not args.use_adasum else 1 + + if args.cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. + if args.use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + + # Horovod: scale learning rate by lr_scaler. + optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, + momentum=args.momentum) + + # Horovod: broadcast parameters & optimizer state. + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + # Horovod: (optional) compression algorithm. + compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + from grace.grace_dl.torch.helper import grace_from_params + params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} + grc = grace_from_params(params) + # Horovod: wrap optimizer with DistributedOptimizer. 
+ if(hvd.rank()%2==0): + optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + grace=grc, + # compression=compression, + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor, + process_set=even_set) + else: + optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + grace=grc, + # compression=compression, + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor, + process_set=odd_set) + if args.use_mixed_precision: + # Initialize scaler in global scale + scaler = torch.cuda.amp.GradScaler() + + for epoch in range(1, args.epochs + 1): + if args.use_mixed_precision: + train_mixed_precision(epoch, scaler) + else: + train_epoch(epoch) + # Keep test in full precision since computation is relatively light. + test() + + +if __name__ == '__main__': + args = parser.parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() + + if args.num_proc: + # run training through horovod.run + print('Running training through horovod.run') + horovod.run(main, + args=(args,), + np=args.num_proc, + hosts=args.hosts, + use_gloo=args.communication == 'gloo', + use_mpi=args.communication == 'mpi') + else: + # this is running via horovodrun + main(args) \ No newline at end of file diff --git a/examples/torch/hvd1.py b/examples/torch/hvd1.py new file mode 100644 index 0000000..8475f57 --- /dev/null +++ b/examples/torch/hvd1.py @@ -0,0 +1,92 @@ +from __future__ import print_function +from multiprocessing import freeze_support +import sys +sys.path.append("/content/") +sys.path.append("/content/grace/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") +import argparse +import sys +import os +# print(sys.path) 
+sys.path.append(os.path.join('/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/', 'grace/')) +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import datasets, transforms +import horovod +import horovod.torch as hvd +from grace.grace_dl.torch.communicator.allgather import Allgather +from grace.grace_dl.torch.communicator.allreduce import Allreduce +from grace.grace_dl.torch.compressor.topk import TopKCompressor +from grace.grace_dl.torch.memory.residual import ResidualMemory +import torch +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=2, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +parser.add_argument('--num-proc', type=int) +# parser.add_argument('--hosts', help='hosts to run on in notation: hostname:slots[,host2:slots[,...]]') +parser.add_argument('--communication', help='collaborative communication to use: gloo, mpi') 
+parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') + +parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help='apply gradient predivide factor in optimizer (default: 1.0)') + +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +if __name__ == '__main__': + freeze_support() + args = parser.parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() + import grace.examples.torch.pytorch_mnist as pmt + if args.num_proc: + # run training through horovod.run + print('Running training through horovod.run1') + ans=horovod.run(pmt.mainf, + args=(args,), + np=args.num_proc, + # hosts=args.hosts, + use_gloo=True, + use_mpi=args.communication == 'mpi') + print("ans",ans) + else: + # this is running via horovodrun + main(args) \ No newline at end of file diff --git a/examples/torch/pytorch_mnist.py b/examples/torch/pytorch_mnist.py index 00b71e5..82c53cf 100644 --- a/examples/torch/pytorch_mnist.py +++ b/examples/torch/pytorch_mnist.py @@ -1,78 +1,57 @@ from __future__ import print_function - +import sys +sys.path.append("/content/") +sys.path.append("/content/grace/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") +sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") import argparse - +import sys +import os +# print(sys.path) 
+sys.path.append(os.path.join('/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/', 'grace/')) import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed from torchvision import datasets, transforms - +import horovod import horovod.torch as hvd -from grace_dl.torch.communicator.allgather import Allgather -from grace_dl.torch.compressor.topk import TopKCompressor -from grace_dl.torch.memory.residual import ResidualMemory - +from grace.grace_dl.torch.communicator.allgather import Allgather +from grace.grace_dl.torch.communicator.allreduce import Allreduce +from grace.grace_dl.torch.compressor.topk import TopKCompressor +from grace.grace_dl.torch.memory.residual import ResidualMemory +import torch # Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') -parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') -parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -args = parser.parse_args() -args.cuda = not args.no_cuda and 
torch.cuda.is_available() - -# Horovod: initialize library. -hvd.init() -torch.manual_seed(args.seed) - -if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - -# Horovod: limit # of CPU threads to be used per worker. -torch.set_num_threads(1) - -kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} -train_dataset = \ - datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) -# Horovod: use DistributedSampler to partition the training data. -train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) -train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - -test_dataset = \ - datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) -# Horovod: use DistributedSampler to partition the test data. 
-test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) -test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) +# parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +# parser.add_argument('--batch-size', type=int, default=64, metavar='N', +# help='input batch size for training (default: 64)') +# parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', +# help='input batch size for testing (default: 1000)') +# parser.add_argument('--epochs', type=int, default=2, metavar='N', +# help='number of epochs to train (default: 10)') +# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', +# help='learning rate (default: 0.01)') +# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', +# help='SGD momentum (default: 0.5)') +# parser.add_argument('--no-cuda', action='store_true', default=False, +# help='disables CUDA training') +# parser.add_argument('--seed', type=int, default=42, metavar='S', +# help='random seed (default: 42)') +# parser.add_argument('--log-interval', type=int, default=10, metavar='N', +# help='how many batches to wait before logging training status') +# parser.add_argument('--fp16-allreduce', action='store_true', default=False, +# help='use fp16 compression during allreduce') +# parser.add_argument('--num-proc', type=int) +# # parser.add_argument('--hosts', help='hosts to run on in notation: hostname:slots[,host2:slots[,...]]') +# parser.add_argument('--communication', help='collaborative communication to use: gloo, mpi') +# parser.add_argument('--use-adasum', action='store_true', default=False, +# help='use adasum algorithm to do reduction') +# parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, +# help='apply gradient predivide factor in optimizer (default: 1.0)') +# args = parser.parse_args() +# args.cuda = not args.no_cuda 
and torch.cuda.is_available() class Net(nn.Module): def __init__(self): super(Net, self).__init__() @@ -91,86 +70,221 @@ def forward(self, x): x = self.fc2(x) return F.log_softmax(x) +def mainf(args): + # Horovod: initialize library. + even_set = hvd.ProcessSet([0,2]) + odd_set = hvd.ProcessSet([1,3]) + hvd.init(process_sets=[even_set, odd_set]) + torch.manual_seed(args.seed) + # print("hvd.local_rank()",hvd.local_rank()) + # print("hvd.global_rank()",hvd.global_rank())hvd. + # print("hvd.rank()",hvd.rank()) + if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(4) + + kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} + train_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + # Horovod: use DistributedSampler to partition the training data. + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + + test_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + # Horovod: use DistributedSampler to partition the test data. + test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + + + model = Net() + model1=Net() + if args.cuda: + # Move model to GPU. + model.cuda() + model1.cuda() + + # Horovod: scale learning rate by the number of GPUs. 
+ optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + + # Horovod: broadcast parameters & optimizer state. + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + hvd.broadcast_parameters(model1.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0)#,process_set=even_set) + hvd.broadcast_optimizer_state(optimizer1, root_rank=0)#,process_set=even_set) + # GRACE: compression algorithm. + # grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) + + from grace.grace_dl.torch.helper import grace_from_params + params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} + grc = grace_from_params(params) + # print("grc",grc) + + # Horovod: wrap optimizer with DistributedOptimizer. + # compression = Allgather(TopKCompressor(0.01), NoneMemory()) + # compression = Allreduce(RandomKCompressor(), ResidualMemory()) + # hvd.Compression.fp16 + + # + from grace.grace_dl.torch.compressor.randomk import RandomKCompressor + cpr=TopKCompressor(0.1) + # if(hvd.rank()%2==0): + # optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc)#,process_set=even_set#hvd.Compression.fp16) + # else: + # optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc,process_set=odd_set)#hvd.Compression.fp16) -model = Net() - -if args.cuda: - # Move model to GPU. - model.cuda() - -# Horovod: scale learning rate by the number of GPUs. -optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - -# Horovod: broadcast parameters & optimizer state. -hvd.broadcast_parameters(model.state_dict(), root_rank=0) -hvd.broadcast_optimizer_state(optimizer, root_rank=0) - -# GRACE: compression algorithm. 
-# grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) - -from grace_dl.torch.helper import grace_from_params -params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} -grc = grace_from_params(params) - -# Horovod: wrap optimizer with DistributedOptimizer. -optimizer = hvd.DistributedOptimizer(optimizer, grc, named_parameters=model.named_parameters()) - - -def train(epoch): - model.train() - # Horovod: set epoch to sampler for shuffling. - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) - - -def metric_average(val, name): - tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) - return avg_tensor.item() - - -def test(): - model.eval() - test_loss = 0. - test_accuracy = 0. - for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. 
- test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - - # Horovod: print output only on first rank. - if hvd.rank() == 0: + + def train(epoch): + + # Horovod: set epoch to sampler for shuffling. + # opt_params=[] + # opt_params.append({"params": layer.parameters(), "lr": lr}) + # optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + # momentum=args.momentum) + optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + compression = Allgather(TopKCompressor(0.01), ResidualMemory(), hvd.size()) + if(hvd.rank()%2==0): + + optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc,process_set=even_set,op=hvd.Adasum if args.use_adasum else hvd.Average,gradient_predivide_factor=args.gradient_predivide_factor)#hvd.Compression.fp16) + model.train() + else: + + optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + optimizer1 = hvd.DistributedOptimizer(optimizer1, model1.named_parameters(),grace=grc,process_set=odd_set,op=hvd.Adasum if args.use_adasum else hvd.Average,gradient_predivide_factor=args.gradient_predivide_factor)#hvd.Compression.fp16) + model1.train() + + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + + + if(hvd.rank()%2==0): + optimizer.zero_grad() + output = model(data) + else : + optimizer1.zero_grad() + output = model1(data) + loss = F.nll_loss(output, target) + loss.backward() + if(hvd.rank()%2==0): + optimizer.step() + else : + optimizer1.step() + + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. 
+ print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) + + + def metric_average(val, name): + tensor = torch.tensor(val)#.clone().detach().requires_grad_(True) + # tensor1 = torch.tensor(val).clone().detach().requires_grad_(True) + # avg_tensor1 = hvd.allreduce(tensor, name=name,process_set=even_set) + if(hvd.rank()%2==0): + # print("EVEN reduce") + avg_tensor = hvd.allreduce(tensor, name=name,process_set=even_set) + # print("EVEN reduce completed") + return avg_tensor.item() + else: + # print("ODD reduce") + + avg_tensor1 = hvd.allreduce(tensor, name=name,process_set=odd_set) + # print("ODD reduce completed") + return avg_tensor1.item() + # + # print(avg_tensor1.item(),avg_tensor.item()) + + + + + def test(): + if(hvd.rank()%2==0): + model.eval() + else: + model1.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + if(hvd.rank()%2==0): + output = model(data) + else : + output = model1(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + + # if(hvd.rank()%2==0): + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + # else: + # test_loss = metric_average(test_loss, 'avg_loss1') + # test_accuracy = metric_average(test_accuracy, 'avg_accuracy1') + + # Horovod: print output only on first rank. 
+ # if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) + test_loss, 100. * test_accuracy)) + import time + time.sleep(5) + + for epoch in range(1, args.epochs + 1): + train(epoch) + # if(hvd.rank()%2==0): + test() + return "1" +if __name__ == '__main__': + args = parser.parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() -for epoch in range(1, args.epochs + 1): - train(epoch) - test() + if args.num_proc: + # run training through horovod.run + print('Running training through horovod.run') + horovod.run(mainf, + args=(args,), + np=args.num_proc, + hosts=args.hosts, + use_gloo=args.communication == 'gloo', + use_mpi=args.communication == 'mpi') + else: + # this is running via horovodrun + main(args) \ No newline at end of file diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..91ecb82bbae644c2e3ee893ba7ffa04285e9d005 GIT binary patch literal 6148 zcmeHK&5qMB5FU54Y`QC;2N3jvjP_cIhV5#FxP($JoDe~9;HNa3vW-aNs!7SAs+2n% z5E8F}xbgz~1c)nV9)LSDwkr~vg#%Lgkw>yW$1~%xze(&E09bbxH~8Y$pP4|#9qcNf*$2%0`$%>`vFbo(5{zeAmhYtaSkU(1celLA~gA=xvy0Pf>{7CrN zXGc$lU(Pg-ZFJ&jk-uyY{QKcB>o?Bb6mdET&xW@7gND+A14=7DhmF^1XNp;XlL&+ zK#|fP$R_Q2M8CfF&U(U`D24&Uz(2$Q?GGFap{p@hD7Ox*9ciZadg&lEKjkJkX^1&x?rV`W7(ji_#zf5 asN>W?bT#G*u?NNc2uK=CWf)j11K$8iLh70T literal 0 HcmV?d00001 diff --git a/grace_dl/__pycache__/__init__.cpython-310.pyc b/grace_dl/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32999e2685154b35045ca1d132f2d922c9b72962 GIT binary patch literal 185 zcmd1j<>g`kf}m%OX`(>-F^Gc<7=auIATH(r5-AK(3@MDk44O<;%!YafewvK8xZ~r? 
zQj3Z+^Yh~4S27ea0abyCU!MA*#Xt%D^u(gF%#x(Uy!1qUm;Ca)oczR;VtvP=%#xx+ zm(r4wM1A)lM`u_4^rFP%R0tWLlA|9VpP83g5+AQuPM7G83zDZZ|rpd_5r}aF0i?S!;&WK zqA#=!y>c6oVLZSXA_(9V97q?uX@M9Z28aP-fEXYK{sji`nJt=8@!VHd8i@g7;J;)* z+#gKr0xgZ1LV4@JL6!iBHC&br=U4~G7*nIAF;gf*ai6MsU^I52%=rYjUCkB)JsbO&ZBq>&gP1{N6*wfjx1*f|Vw%h&H55_d9+-H^NJ^TmzB zV3fAo-`U1y?Z#H!s2e-R8~dD3?9@reqn?wz5}%%P9=ca^$9)m_lRz;mU^?v{c3WoW_;A)Tr=9lkgO=HScr=?ej63)4 zKYBKJ7Y-wSsd8faz5u(W96zi|Fg)wnr+gTznBBXpe22W>*JK!Q;s3aG@a<%^zxPM} z@p=AS+TxTdg}JbkDCE%@`iQ)@wSX}s;KK;FH4<-Waz)`jZC8z^0n-i~9db_51%^1WkznVqo1EV70#8?_pL~zoE+sidx&m{uaAv p!Y@-OLvRo-VqKQwi1R-Tac*jxr7=^87Hsq)K+-@1G4NLz_yuOvJM91f literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/__pycache__/__init__.cpython-310.pyc b/grace_dl/tensorflow/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b82ff45ea0bc4292b6ca8a6e5707beaa0f63070c GIT binary patch literal 4998 zcmb6dO>ZPewW_;krl&o2b~l@dRT6EJ1j2wBX9GmStQId`60NitiFJY)CF*T=)r{Ne z>7K3Xu{W(TB6!6Oap1t!KKd8%S5R|ALgLIJ5(nVDs{Wd>y;0h#=lXp0>V3aSR#zPY z->?4p#pt&$5%LclG(QUlThQbq0E{pikqI5ri7_-Nj*ZA1S|1W-GV3{E7N;lXum$55 zYs0v$#%&mPm%ZAcOAH85j^&4alr5^;zij|1DBKDCJ?$IW-<5WPM!Va|1Rf#%l)LFZG$jG| zC`|X&xFUDpR7W`(Zn>$|f!d^+Q9$dpl}`=KnbEFaLgHx=GsCIncHD^t8V z88t4ij|ra6L<|QsQ9(s2(*#N|GtOMaURfQ>Zr2;#BVW34lGe7$+2EY!YdcB$ury!J z&duDCJR0R~eN}GZT>4xb=&#p0`0qb7V0aO%jUS)3O{YWl`l=6Boy2ib*GO2K7ZadmR ziQAVkzdZLNSkw;LUSNH>{*|o%Yeme1O4Gg;m$gC`kG+bg@=C8*U0?W9<@HL|74flr zOw;RSw;N2o(r=w>EzUcIaKt;8Fw?0q{S=pf03ds1yRy?ky&j<7DnIO-(6*te93>?O z^wgL`o(iT;vp-aCyFVTaKE`ObtiJ_aFW}pvz8Cu|%(XU_SCocOZ2?QZ4ix6r#83D0 zmd}_M_YG~`s6NYv-StaU5Ak96%3deC+2BXDVW)5G?)JN)k7&M&;Cl$x5ujVp-uON2 zy^r9>2x>o~ddejaU;$56F4?gR3mkISp}nra)d1SUhn(MnCehEAE}5HHTc}FL(uLNr zaLYre5wJ4Id+?A6Sc)tV?TRBhUE>V#VNi)H-ksgX(gcgKi(uz`9a!5JZ z4X8um6#E*^aZf$6%iNh4d6axlge$x|B0R~KEb;~Yk&OHYSV_)C)P^U; zY_`bLNn92R=IuDklDNE(YJw)agRLj}QxIgmW#oJyQ!1!%?t+L3Cfqv`xgUs2A`iCl zZ2NfY4x`=-l#$S-eh#c8OR}Pd8t2}j#4tm+BM7*lXCr1ky_id71J2VtEjhQHn?hxR 
z>-1KRTJjV%;rjJT`23%2VUP>$E(N z^CTXh7B_iT#6pjcRa&a}bWxV^_&X_1iY)%(?GGo%Z^wy>C$iyOWaCoiu+uavZpN_j zwuIwt{f_4El@I&3F-J_cM?3U(ur0{ZCY#BouwX+DfpEnMkUpn`B&1?ME|6#FDf#8I z$~SC9O1g4@nDOu701|rd+*7EWTO%%NagJ67&K&WoWjsJV(^}TpRRAdX5>WFBDt_6z z4AdLL5(;vANU8=8YP_Ahu#zol)fUdyqEmJZ1>GJ*7nXatZiz9zsk)|Z_Dpw3kPG&V z49V}#izb&|8J1n+8F%T_)~|kcs(m28>YI+)2de{f06hTGGg1vqugTWOF^>I$P@^-K@@r+Q+-6){gR|Es7*_Dy(Ct0Bslef5Vysr?D`ml;Bjxn~dm!cb6c z@_=k~VEuSLm$|S`Ikhe5s*89oq0yk>pc`?b@YJk7DRQ_v3XjYlHa(&1zT~lzT*ot6 zH=$*j%&n)O+oWyFd;$8nl$PD5bK4Pe)4Op0l0-?kfzr0(YcNXW=uiddzO{C5*`$0I zg}a+Y-R%rxi#nTWoLIk+A9Y`f=wO3mLOe@#8SVd11?#*t7xJBD5NkAij8F=`wu~^H zU(={t|2A4nQ-w`zX>93vG-$xx_0eFn)izh5y0D~SJzN}WC_YzmBGYS$nEo1=OYsZn zOWLPBppXVm2oi870HjBs1Ahq?A@C0E0UA0_>Fv|UTes_YI1T>#szcDS>9UA($(!Rsc6Z?KIuwhe|5(jof~F#@)* literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/communicator/allgather.py b/grace_dl/tensorflow/communicator/allgather.py index f3904e3..4bdba0e 100644 --- a/grace_dl/tensorflow/communicator/allgather.py +++ b/grace_dl/tensorflow/communicator/allgather.py @@ -1,6 +1,6 @@ import tensorflow as tf -from grace_dl.tensorflow import Communicator +from grace.grace_dl.tensorflow import Communicator from horovod.tensorflow import allgather diff --git a/grace_dl/tensorflow/compressor/__pycache__/topk.cpython-310.pyc b/grace_dl/tensorflow/compressor/__pycache__/topk.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1b1d59c597c9367fbb0961fdbec60d807eee2fd GIT binary patch literal 1969 zcmZ`)OK&7K5cZ>AGn<4V8(0O<(gy?@U^5bX;eZgUL<#MK2UR~`H7{C4b^XT`8kY7=` zJpxd^g2#LgLJ~f8xb7q-jmN%@S8|0jxvpKiH zL3j|Myo?)iMH`N>{K8+bc`y%caG5lu39r~ZYN93_F^D7F)f9YRqj(C@L=c8mvfpu? 
zv9g)j7gOb8VY0F?*71oralELkV|L_%iLm=FwDmMQbX@Ei7mZ;>>3-}Yt158i*7D`; zGA(RTyI7TKQq3lg9y*%2xTs{2D>!*1%9%2+h<*zq`6D0vV3an4vCv0_-4oSV40h_{ zs;q@HgD1MMTI|fM6@#ZrZr7!F^z6IsC(j0BEpoMxnJfp3AVy_s zE!)-TpwD#&CU;MrOhqL#Sxk)Unp{|`bXLiJpzp)J-tb0?kTX-9DZPfvn2&{c8x6Wr zwPEA7!Pr-5j|Nn~3$HmmcSd^!^C>JVU#dg)l8mUNl3gT2Um~~5^(6_&FoN)7xAwfA ze)~V=iKU7U`ZpN5CFj04DW-vBOcL-xcl+FhW;RvYC0SNfh0QYk2AKDm3ydm9UU@}b zC(H6u7z5k>#&}u%5Cm!lli#IXdiLOMNL#JXJ5V8%cw!rm9)a*s(JKlRL7?F7027d+ z2`)KMDga6`pj2qXCTPe(bk2Y>(L4t0xB+>PfHi8JCR%DRUv_~;@fc_byGg^5Qu9ue zjA_&H)%^cLAaz8Fdn(3v7|JD=qt`=!v(njvW392$Dg@)udrT^^16be z>38+}XpZQ+pb``1R@$OLkHfX4HY%~gD&0eCM97z!dvm$n5<*GGb9?f7oh9x5CY&_u zAi8u7>a54s7|zLmq0yeXg9aA=4I1Bo($E(RB(OmN4$Jn0$OMpG?vZH`2pYsBKIiiU zG9ETb6Zvc>NU*aXcU1e$bni4B6=hje<5W$i_B1tRZB2Sy*!>j9RGSpomoD?3?uq;` zof))-mX+4J7#Ec&Q;*6uR(3yX5u29KZ$5%07eg-*fQsju{s65y9uBv@NUO=(c$Cyg zIsFNWoAm3CQGEx5yT1fN-dq8W{G08qzYpPzLC_=C10)i1Ri~Mc+ro81PH07;Z`>He*;k<>puVh literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/compressor/topk.py b/grace_dl/tensorflow/compressor/topk.py index 9de37a7..773cd77 100644 --- a/grace_dl/tensorflow/compressor/topk.py +++ b/grace_dl/tensorflow/compressor/topk.py @@ -1,6 +1,6 @@ import tensorflow as tf -from grace_dl.tensorflow import Compressor +from grace.grace_dl.tensorflow import Compressor def sparsify(tensor, compress_ratio): diff --git a/grace_dl/tensorflow/memory/__pycache__/residual.cpython-310.pyc b/grace_dl/tensorflow/memory/__pycache__/residual.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ec9befc0e9b091bf828b0285ada15719b3b19f0 GIT binary patch literal 1295 zcmZuwK~EDw6rS1HZnq5}LJ{$x#6)9F?8ZA$2n0PKCKx<4F_WP)EnBx;nAw6f+lzAb zHwZ`ngg<1i9*j57Ui7_LDup=Be0^``&3o^A?`=9gT_v!-c^}(fd_umXGP&T4tim>r z03?w#Ax#SZUgGswlTq@5NGAPLB7Ma^v8FHS8&WU*1Z%Qgr#5({dZ|8Xp+317D66o| zLjdJ`Ha%I9o@A%A2_9wXlz_KDmf^o7L-1O)Gds$}av+kz>koJf>(}!Zx*IP{xdeM1 zws{Eo|~$N1XV z@a=_Yt3#zz!;|>EvVLFmRNc3}QAt~ufq)JHEIs;4^c+aPThXpj+C&|pkK$}k^gANj 
zNRRtTDx`_lbPT~Z23aPeO(oaUL_FDgy}q^;b+l-y0=Z10O!ZBw+evyH^<0;tF`34} zi48fA`*FrOW?-fO0*b#`dY}4#hKpBQ(HL7_X$7_gw`;{+5L`(~7uv8yYN&ZRt&v4C zqB+hD9X{ASkRnsd**8aOjmH5jLJaCkvsG9 z@;M!adDtn4Eld!bBx0*lVL&)D8pGK@7!!B;^@^jU7vRkCDF~ols_o5!C?7vn(y~_e z0p`pVjbA#2n{XV^RtB!TJUzHxouIf`glGoGZORLw)d3wY>|FWg8in0<-P2eM2lW7_ z1;>_iTjji$%0Yti6z7NVwH7zJhW57*?jc;Z$4S^Djl+UPa4CeOO2hJ0#hunmwpwt* mfz{XyyW|GC;SdJ8F>&vWz5OqY4Ofa3pHYm30Ye~MV}AjyHAR&G literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/memory/residual.py b/grace_dl/tensorflow/memory/residual.py index ccd4dc9..00347ce 100644 --- a/grace_dl/tensorflow/memory/residual.py +++ b/grace_dl/tensorflow/memory/residual.py @@ -1,6 +1,6 @@ import tensorflow as tf -from grace_dl.tensorflow import Memory +from grace.grace_dl.tensorflow import Memory class ResidualMemory(Memory): @@ -8,8 +8,8 @@ def __init__(self, beta=1.0, gamma=1.0): self.residuals = {} self.beta = beta self.gamma = gamma - for v in tf.trainable_variables(): - self.residuals[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) + for v in tf.compat.v1.trainable_variables(): + self.residuals[v.name] = tf.compat.v1.Variable(tf.zeros_like(v), trainable=False) def compensate(self, tensor, name): """Update the tensor with the residuals.""" diff --git a/grace_dl/torch/.DS_Store b/grace_dl/torch/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2ecd6ce1255a2c865a79a3e17165e33d226de615 GIT binary patch literal 6148 zcmeHKL2uJA6n@^ymTm)r1QNSN;#!H8b(#>DP|6(<1P4GRX`(G6iK8T?LRF>Q*&o_< zhy4V&a)e)ixbdFtZ3%6LxJ(TBMfT^|-*evkQpYs_u))Od0hj<_U?FU6V70_(oXUc& zsGeJhOw2KX1VR{t3&jFkcW4E)0{@xm2Z@Q@F(>AGq0Tx_iNSl;6=nDV5_VnTDI8*>F+o2cM;-=DEA0i2`J*XUF5+EdTD_0*XqDN_DH7TCcqXMbkC`DT{U;$l;S=Kb~O@b9{& z{ezQgGhwm?UG9Kj8Y5#ZAOjNU1D$9;X-sFvg~kj{&W(WqbCcCzUQ=@u<`%2NysqXo zX76e3#=mfq*49&F`-fh-Z;9v$rg0!fZ*DIFm0f7gAt}*Xy;^L zGJ}~H+Mvd2aJH3NzwjUurylMr8jf}7J?QUj5Ha|1PW~WmJ$d%(h`BL$;zQ^>l#y^w z{P-0*X}FHh3t-8aqPzjZa8wgJJCrvj$9_Y+rkG-b|!J`c7Noo7meM` 
z-Iu-X-R=mW;+dpu+>ImQ9d;@8LqDdpb99QFE$ut6EES6B*M}Z($1l}dkFYFR2 zU~4*Ql9s8ydhlVc;b~jY<+mUT)-f_LSWEE@c*lWp%u*b*U|eUm8rNBaHGz*dTL1(X z(`GLRAQH)UFMumz2oZFKz_S7xKO8x3$Q;J!zBnWQn1?wjJNtJtdKqk-lBq_usSXk{ zWKX*yeaNu7d(4G9;^|rz4N0Y+aOw(9B>>tsvIu?z2Z)bRl(4Dj0Pq>JRzUdr&w{Pc>-w&RN5^zap|~4s-fG0#}g4s$B)a*^CCd%P3(l6GtQhis(=FvJIUh$ zs#~*BAfE+>`G!ZuzI9Z70AkLYLi=8CHg{&hy0vs6PO0}Wp^AoM>^#FEImb%_L1-H=6`IBsOY2=yMH^-hwWlgD4Wa`kcU6Jt*+B zzm(QsUVo%6q+Y`F3?^r+;(`P-flMcN`I70g&v zD~_v_Y?h~dhU%vvk_S~wI^~M@8};dl3onP@9*1Icw^>(NrAX_Ckg;PfZ>46dde)aS z;~vf#S6y?rc^~5LhaiA<7~Iqf)U#f|gWrU@qU4351oEeY%EurQ%!^rsZZB0>Dd4{+ z%Zs03Fx6gbmjvpm{#HMOs`^^LG_u)m`k8TND!qwcP&YNE=Gz*9U6(Z&ser!#~53s`=g4h^Iel`%j18nhDM(Iv7%)=7)l zBw3&Hf41}n?pWD-xF693SMWIjR|*%1Xv25$If{9_(^gI$Wx_Xj1oFN`lKH4tSM_hC zf0poy&gvpn1ir%CHs`NYm#a8G0|oXx_3{2Nkra#>q8mKRpn{|9yqfN1~# literal 0 HcmV?d00001 diff --git a/grace_dl/torch/__pycache__/helper.cpython-310.pyc b/grace_dl/torch/__pycache__/helper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8541fdca3fd37d5a718b4d7e330c741dceab7ce3 GIT binary patch literal 3310 zcmbuBOKcle6ox&vPj+oA4PAFzU429IlGbk^Ga%mvndM25@=_t$&N@K0 zxhc0zkw1d$1j#^l@ivk6(je0t5qS9)S~oNat%qks*2_T4DD5${7HIE5>*ejD!^=X; zD(!P{nh6LEF0;y|2j()Epld6W&1R zXD{0aKZEXoJE;5&f!(jzX|MXAKXWET#Xzy!o3q>9N7hrRc?-DcK z@KfugGn!?2%r)W;yM6A+M${X%uD}v8g5?q{5Aoe%k2eO(F|2#knum%=KwX78$M=ey zHy-*u%#OgB{Ca*u2E^;*d8Dn`t-cj&7jN+yr}B>5rkG;B(@zw+s3%(%slb zOc5s)L78xh9}!2r84xo~0&^v3uH5OxO)A*+SMt}ZrmW^mhFmK#}4;WRkTG#n$@Z;{kUaY zrk@a|u;pDp4Mo(fa?x<0B@ND{$!bNj5M!lcPV+MhrQ*4!aklPgSsG5+E*(+VPiG2A z^JcYNvZ^Hpqwez;me5(cxUj`E%4O-32XuXN#*g2u%pC`ry>MmjcoQnLT82}T28YzS z7&MzI@rzh|bEJ?!_eR-)(s9``&z7AnnqIN*;1Jb{x~@?Bi3Jtnw*xKv))V7?+wGcR zIW@6t)x_qz2uGn+8Wyv~Dy{0OYQC{$p4R5ZJD=Tlg4CLz6_QvJO-o-Fwm_6>qV&V} zbp3v&pgX3t5Jo!2u4`ytM0<-#(pfrAJmS^zy6MMCrc)TjPK)L;;{h>S--In^k?OLQ0zJJw z)5vd`Bywzfl2TjJ&v9XLdnn9vnR$UTDpr{#XkvS3K@rGy!lt&z>h&P;A+h5Qs4!$F zK|AaqtO?ok0xql$*-P9Gn+WPe_7k?lChGMe2MOFEV2whN!$j^7Fsc(d3drwXw4K*Q 
zg_~5CCOa>sEenHBov2WiqmpKfJP$ozSCN}|-qfLkMmQ!3(r+&ZpzFt0)|j8V zV@uAKRdL3t5d9c-=~GjvN|ZWSNgubW+$4PjtC~^;qY6T$yGmu1sED|dZ*7vl|eQ`%xVI5K**R=>hc*V5|m$!AIn`oU9Iz3uL4?Rj*4REJ74 zq#92>nu-Q<(r6mK10wrG4?Yza+C>9KkIGY6^kPm^PjzTpZYQDn`e|Xa8aH26d!$IG GBmV+olG%*_ literal 0 HcmV?d00001 diff --git a/grace_dl/torch/communicator/__pycache__/allgather.cpython-310.pyc b/grace_dl/torch/communicator/__pycache__/allgather.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d156fb4f48ce5566d2842c18318b1a343b978ba GIT binary patch literal 1961 zcmZux&2Aev5GMCutCgL&j^Q9hu}xDH0urldBPbF$NH0Np2zuGJSg<0sR>*%k5kU#Nf zdpPj(1g3fnf)h@2lHk|MX<;Q6McvNr!bzOMP29puJW9SL+~Mvu;jXZ*?8N8v73q5) zzyo>NrzSe7ieg%hhpet--!>hV=NYR{g*3ZsHD&6e9GWi5Q(2YcOEE+T+rwV|sE42k zl2A?(>m3MH{P{)a`)gg$QT#-4| zR9g$Ut+q8uElysM8~PGbchz(PX$NUKF2{A6n((}mIZqX&+joqkL_RX%uqq}}C{;=0 z6{4u*g~Y1gP}51%Va}9F(;M=)7<{jURD+DkvvK`1D>F8DR-Ko5#kd+gmE&5nXVbc7 zgXe;uR5^R};=7ZlF9w-pL($5V=L0C}@N@t%*EJrjt9mrKkR1pOP!oms+@T@8+*{Y; zXmyd{sC+MrttGIO;( zx3#;Vl)QB`dF{M*ZvS` z<5O1hTqyLsMeT>nh%pjJ#c({9tbDURE-6&~!#^#VZfFgw6;?tb-u*~+adU6waZ@kj z;N_EOwS%$br(m3Pny`Fae4XrW@1`s>OY`P=C=;~R65RO9s+imDi7jA^*Y3$%;Jg*L zF{iNU52=D{2ftKQ77srDL*x^oL?YHNQ%8}fCLoM7+1Ct$rZ3z775SBKa| z8k$;5n%-}AJlz4xN2tH{ zn`W65P+DPpb_P7mTI~D17EBL1(KN4(J*AwTlU?dfJd8d<+Xz*NHB>! 
zz{)G^l>q@(Ik)aLtNlRUg*CI^G~S~|LgPJZo|Uw4t9M$->a60+HGE>ti)!&h*IO|g VW;dh`9=GX#gU@{x*xIN2_P=v763+kt literal 0 HcmV?d00001 diff --git a/grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc b/grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c2ae81f54efcf50c6db69906863060720c12274 GIT binary patch literal 1111 zcmZuvy>HV%6u%G0P1B_6M+6dLU^NnPHyEmlDithM87dZw$~t`4#Fgz!@6IWWoGJYW zkdRp0k$>q{CUzz!-r1xLh?Cy)^WFRW{hsXgItb3suV2nz5`@0HpVi~R&ub992Z|$( zGnC^9V`r0$-k5t>tjD!jl)=xp}`nS1kAMQ^boRjWc5^unu%X;t0=KSm3B2P0ql6@{Vj zN#NUcQIxq*%n0kV$wU-fxoWnP$b6!N)>7FuHU*E7uv8lbF`gFLx!p0MfL9$aJq5R3 z!E#~O#;o9(&=#k5yYX6a`i9gtRQM&r_gW|&j+mOJ<|8XcEIg7|MJ5^7;h{>6Vn?Mh zEPNyQP-g7K@w?&SaX3;e5si*{76K2+ID{VVX$kK+8BDHir{P7zWNQwxHYg8o;b+*x zv;IBCg9o?2;C6fCq=wtE*05Cp*b6+T05H166|Kk}0Z8ddg288b9NjMX6h!wyjZlTo z{5fv;`b;h%s752e$~nSDk-?tD=}?A9TUZ6}U%$;-^y-8jB1S=PL$U045!Q*I|#l$%<$KsnKF>D#DHx_*7< z7PnvT!JyW#2ZMSKl*7gL#!|_t+FwAuIyN# z4uuTl+&6HJIrepUEl{AZ&{JntT3c;OfU~pwnHhfHM^-yKeFE)=KYu;@oe}Z}{%kiG zBi}$b&%j6`X-*27P>Q~ndxf9)1xpwu-x29cc0(jnbmb?33{FWD{sp^8MBU)1DvEiT zrJ}BM{aTb05g%7qWe$*-__@w%Esp1PE#hyLJgRc>^~>*%p1+JITBK@i zxy)lgGM&bd^A6bf4%PAOT6Z8dc7x{}(mq{0yu)JL9wcqCg*60=ML8TnH@|?HkfmP} zOU~(v9+Ov}FByykwP)A=`#*cjaM`iJtzR?SvCMjPVA+c9gHZMf_#yf$|2+uiV%Pe& zz6~0`V|{?uJ8_gpp$jx%pc<@PPnGjRX;CW|irGw+Qe%nEH?@wsx`+P5T9p8@;Vs}w zI#!B8xz1FSGFQg+bXCuMs1##LApy;2Z6r#7y0|yyDg>FV*CP3h;a7lqkpzSyB!aH4` zHKH09!gch<&m5h8gu+3dnHn)Z+{a)Tc|;ap-kdEz`YN^tXaG)n96Rjr5az|VO-5dHh3~S8i{Iq_Xj9RYmLnxV`V%}u>F)n+wq4Qr0 zT8{7X$p4;huw6Xjv|+gmJP(260ewOjPyYX`@UFm9fQrMXfb*h~^BnyF=NEI4uVcFDl&vb|n2dJ=`nuz3OC?vIzL#<^=pnGH?_Xzk1fe?{?3xe3TUhEPIF2~Z z(bVoF$4fFL7`;Q>buaH)!f}p9io9yaSkBvv>SqG~VK^U$y6m<#{yAOQy489#v&l=x8n(&qSeB zq|0bDJ~+?xFtrt1^Wb3D1eS`x)p&t6q$Ue2`!&8L+FSVF@CMhoCNlyI@cPA3`MF4$ zvZxm#(xQOAb_&FS3AnzlgxSv&Z5dI=$oMW1Xi(z9bX{^Zjy>ZmknNC)>xh}Td5)_T~!Os!Jd` z!JTf6>)S7S+vGmPu16nw>|r0Tpa0Y2_U&ohF(IV|TM9}|kJ4qySGhg+DZN~={7w 
SeBJPQgDt2<4pE;x@qPpGH1|(-C#u?ZHc=RzhXlE#>MMT z1dCfR>=$4ZQM4i>oLNO{He!@0uA--8#P1RntK=zBiDoB!6e;>M>8CH@D{|1MzPI1j zGo!6-O~G(;^CU3Cy<{JT{T@t5PRSWr;Y?v>jw$MR$2z)4PUs0+aTguMuqH=|V$Yc( z9sh%UKvq#lx(MdDi@JEi;C6oSlER5_d!qDlS?GT1(@7KZ+14k=vYKoAf|$$j>R&nh z#cE@RQ)!M%H!oE(0q)qY#a&+q-XfBW8WYGk3;Hdoa! zL^a&r-I^iB*Vc~)vxU#>Od4BG78r}Y04Aduy-Kfx&8WEw8wHFv9==%xLNystMMePe zh(99}s+i(WGaxTgF^~#`subqr8Hva!1xPc0<$k#ThjTuc%})z=*fydn$3~h`Tk);5 zT8XWT7%v1MI%!Q^HdC<)x-1aJMtoKd^x&qj8eoLCQUrKKI}z?s3t70iti+lFVauf+ zyrCb|5)Mqu-T>psNl*?blXfHr-GJqaK=ACq1U*DPwsQ!`XL(*WrOR`_JFVKWta9u0 z%pj0`<|C`CNm#s~ekae1N?M!efn9S2e9XIG5M+z^_h>?Uba~?~d11Vwu*w}>1ldKI(nL& zkqKWVT|A{-63QT2J_IF-4N2?pUgPTi!Cm26u~it7HPwwq76KhrX&th$kvaN~wE*!p zwl1NR)WAn`H~9+6EYnTVB8vty?eBPQ=e3W=3#YAl54Ow>7{6P~21w1V)+&fJfV?iK z2r2HT=6&?uMenGQwf6MTT*9q)aEll2$(jbNw7`YWwh7>up%(V|B_QN7y~_TdmhA^` z(2~K|$mTwtatlny9`F?pAkznQl{j)lp~ECc3D1W6Uf7SiUxZcv<)adr%mpo<5EBFX8~=W zu8Yi?kI*xq9eP|GZd)UNZS9k*R~Z3?frN7=dgm|kB#VDAJ(fUMLOzxszFi>aA3%eF zqD01_D0_Kcw`yMD?+bbUJ7{L}Ce3rz7Ep^=f#zd0H^KOen<_1IRc$LZ1a=)=Ld{-- zUnrswRz@Ovfo1XKU4#4mIKVdA3q5!cEcD<(=)nft%v!GK`&!oLsb^oW1O5j4j72SF K1kUGsJO2UMM(H5{ literal 0 HcmV?d00001 diff --git a/grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc b/grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c51e8cc84c04a7651995e674f5d74b0adaecdfb GIT binary patch literal 1654 zcmZ`(&2Qv16!%AxNk?t9uw{1XQph6i1c3ko}caae*QhzTU&hs-{Zf2AN>&!@+TUv zf8ZnEf|##BNg`=UhWKYCovw2MHv78O#CGMJQPdo+YQG4FFeum~t0NJ8P7SN-?Xa z)NLk=}rvU3j5%ZS(^e>xZZwp*J=;Vz09_biEEyqhp+Y6V)3SbHDRvQmqSy zyF)ZPwSzHzsn(TmgKcn&*w-CU{T>)O$gPjBEb}=G6d6)UhU^6y0oNqEih<1?tAiD* z!xtnV!w4$wx$Ph8=KKF)F)|tj_cxfEC6{DDmjv*lEys0y29 z8dFc13ydm9-ngc2Bg^tq7z5M3X53PL2ns4IT@VV8{<+%@kyM}Gi;Y2ex8lk)ciNfs9>4fRJbtG7V8pb`_M-@zfz z?en`=6t8a&BMj1RlimYb{wMWy+^y6x{wDQz!D!j4-LV7?1wgAWC^S7jHuU1>{TsF< zN3 lx0zk6E$h$n?_PQHYtpz_;&Q;J=YTIG_lEn`N$_ zWs_P?OB`=z*>6)(b~Bz6`cpjgGf-}8#hYA}<(fO5Kp$g>XVLpG!Z)Go7P86W;9jWF 
y3zhk%-SO|vuGjGH?D}^{BfUF1U}yOSuN8X3QWt;SrTPs%wB2KbhwwPw>HP!fCy0pv literal 0 HcmV?d00001 diff --git a/grace_dl/torch/compressor/fp16.py b/grace_dl/torch/compressor/fp16.py index db6fe4a..5021ae6 100644 --- a/grace_dl/torch/compressor/fp16.py +++ b/grace_dl/torch/compressor/fp16.py @@ -6,17 +6,17 @@ class FP16Compressor(Compressor): """Compress all floating point gradients to 16-bit.""" - def compress(self, tensor, name): + def compress(self, tensor): """Downcasts the tensor to 16-bit.""" dtype = tensor.dtype if dtype.is_floating_point: # Only allow compression from other floating point types tensor = tensor.type(torch.float16) - return [tensor], dtype + return tensor, dtype def decompress(self, tensors, dtype): """Upcasts the tensor to the initialization dtype.""" - tensor_decompressed, = tensors + tensor_decompressed = tensors if dtype.is_floating_point: tensor_decompressed = tensor_decompressed.type(dtype) return tensor_decompressed diff --git a/grace_dl/torch/compressor/randomk.py b/grace_dl/torch/compressor/randomk.py index 3be19de..c345956 100644 --- a/grace_dl/torch/compressor/randomk.py +++ b/grace_dl/torch/compressor/randomk.py @@ -1,6 +1,6 @@ import torch -from grace_dl.torch import Compressor +from grace.grace_dl.torch import Compressor def sparsify(tensor, compress_ratio): diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 6d2a750..866d63a 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -1,6 +1,6 @@ import torch -from grace_dl.torch import Compressor +from grace.grace_dl.torch import Compressor def sparsify(tensor, compress_ratio): diff --git a/grace_dl/torch/helper.py b/grace_dl/torch/helper.py index a59a6b9..cb05989 100644 --- a/grace_dl/torch/helper.py +++ b/grace_dl/torch/helper.py @@ -1,4 +1,7 @@ def grace_from_params(params): + import sys + sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") + sys.path.append("/content/grace/") import 
horovod.torch as hvd world_size = hvd.size() comp = params.get('compressor', 'none') @@ -83,6 +86,7 @@ def grace_from_params(params): return Allreduce(compressor, memory) elif comm == 'allgather': from grace_dl.torch.communicator.allgather import Allgather + # print("compressor",compressor) return Allgather(compressor, memory, world_size) elif comm == 'broadcast': from grace_dl.torch.communicator.broadcast import Broadcast diff --git a/grace_dl/torch/memory/__pycache__/none.cpython-310.pyc b/grace_dl/torch/memory/__pycache__/none.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..313a364868eb37b48842ec348074fd9890bb0140 GIT binary patch literal 749 zcmZuvJ&zMH5VhBvWr=bpodg|48j3B5D>{VkloK>ZkU-aLW0@Ua5@~lGY=`3_8pQn> z{*qfHBx*V;X1p8;5+lv5$MgA~*B%{>8OodQpRb;B#=enrC^4(=2tB*y>VVHwfnFR?{vS^^4wnct<~V=nOg^^&$l6H`NF{5 zw))A%>-pJ*TsqyD!76BFu&!CjE)gMn#4_95#BpQ04K9a^j&R^lA|%gt5BFGS`;4Z* z=$+8{AJ{;!vGf4;9_AuJsR~C^9M!seXomTihU~iDXm!L}3jVKCET0VvJSW1^KQdtt#BWE?vM2xm literal 0 HcmV?d00001 diff --git a/grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc b/grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..676a08aa863aa5f09d4c9691cbc0047043330a9d GIT binary patch literal 1093 zcmZuw%T5$Q6us52Y1#o_gNQ3)qHz{Z{{T@4L^p^DhQ!RmPSI5kO}eMCst07!tmGHm z>ym%*3u^1axHEC*%6qHnFrbyxsn@Mj_nf|6&d*1H?d|W+@4qIZ=mo0#3<-96TFf|sZLT_&NdepYuNQ;1PMbX0I$wj zDmHY5C;T%QdLj@V9DNZYX=E05q|U^I=MC!*yo>GE^OWe?MsrjE8`$+dgbZp(pk|1y zb`Tu_NXNF%T{EXH7`5^CWW|j?;zhyxuJN?Y->VMZS0TbVC^G2JyLeYirQ;D-M_IMU z$0HtZmdE3~y7oTc%*nL8%#tVSEWk#bDR)r{PjKH5DLypk)`K6W|QrCK@8s(rg|Gt>h`dG%d2hl2^4IOC0ZA+0|R+@QlK zEsI0sK~8ecYKfc&%wyB#Yg#TNQ&QYE@}26=s(FZ$+Wi8xSNmU`(*VC`a34-Pb;l}m zRO SMiZ_!**s%JOWa-Pxqkut8U-8x literal 0 HcmV?d00001 diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index a2ae61f..55819a6 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -1,4 
+1,4 @@ -from grace_dl.torch import Memory +from grace.grace_dl.torch import Memory class ResidualMemory(Memory): diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..695a77812ec27a7da4b88b14093c1910fbf7b2d9 GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8O({YS3VK`cTCmlYB3?qQFJMFuDzzcS24l7)X>%xr-1UWg5ueAI z-3?gn!IOxcfypdKD*1b28xtEQyS?Z3GTWnoP z8Tq9>@~^{jKD2kwWt@$|c$}z&Fc?F~-Axz=GI!-H4pNotX@}LY8bkYFvFM(3TB6hM zEn8yIY4?v?qI=p~E*sX~;nCUU@F{wZ<%{N$1MNz743_W;#+S0@UJ^$#egsdMRYnsM z1H=F^Kn!dY1Lhd8nj2L$6-x{d13zH^_Xh!r=olJ9FYYaNr{#@t*CDWbL%m5FkIv{@L%3{rr>i90LIB&O!&E1pqcSLTeklH9~f3 zE3%v>jkk0P-_pZ z;rHgw`LoL&;WZwUQMp2J42NyVaQ_5S5SYafV#K^!Nlt%W^jp`l^2%j}!5ka>mMxrCG{tjZ64~gVK?zg#XyBTFW$_ z+`pi?6kjHHy+?&ol~pPJlQ&IbksRT^%Z!(q8oUt(G|hJ7MpieA8U_pl|1twqA2>Eb zS7V`2mJaOX69CadwGL4E$FNFstYFx_Bggx2_zH w?pluRBQ_H9D-_BQ?DTak2XqzR!zKl7oCb)l#zG-#P|S~jq``ECfj`Q?FQuBN!~g&Q literal 0 HcmV?d00001 diff --git a/patch_files/horovod/torch/.DS_Store b/patch_files/horovod/torch/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 input, output # We keep input in order to make sure it does not get garbage collected @@ -79,20 +104,25 @@ def _check_function(function_factory, tensor): def _allreduce_function_factory(tensor): + # print("tensor",tensor) + # print("tensor",tensor[0].type()) + # exit() return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') -def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor): +def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform 
averaging at framework level - divisor = size() + divisor = process_set.size() op = Sum else: divisor = 1 elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensor.device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -120,7 +150,7 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor try: handle = getattr(mpi_lib, function)(tensor, output, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) @@ -128,7 +158,8 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor def allreduce_async(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -143,13 +174,16 @@ def allreduce_async(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different - ranks. Defaults to Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. 
Default is the global process set. Returns: A handle to the allreduce operation that can be used with `poll()` or @@ -157,30 +191,32 @@ def allreduce_async(tensor, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) output = tensor.new(tensor.shape) - return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor) + return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set) class HorovodAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a tensor.""" @staticmethod - def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor): + def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor, process_set): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor) + ctx.process_set = process_set + handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): return allreduce(grad_output, average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor), None, None, None, None, None + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set), None, None, None, None, None, None def allreduce(tensor, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): """ A function that performs averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -199,16 +235,19 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. 
Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. Defaults + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all @@ -216,12 +255,14 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= """ tensor_compressed, ctx = compression.compress(tensor) summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, + process_set) return compression.decompress(summed_tensor_compressed, ctx) def allreduce_async_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous in-place averaging or summation of the input tensor over all the Horovod processes. @@ -236,24 +277,28 @@ def allreduce_async_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. 
+ op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the allreduce operation that can be used with `poll()` or `synchronize()`. """ op = handle_average_backwards_compatibility(op, average) - return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor) + return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor, process_set) def allreduce_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs in-place averaging or summation of the input tensor over all the Horovod processes. @@ -268,19 +313,22 @@ def allreduce_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all processes. """ - handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor) + handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) @@ -288,16 +336,18 @@ def _grouped_allreduce_function_factory(tensor): return 'horovod_torch_grouped_allreduce_async_' + tensor.type().replace('.', '_') -def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor): +def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform averaging at framework level - divisor = size() + divisor = process_set.size() op = Sum else: divisor = 1 elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensors[0].device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -325,7 +375,7 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc try: handle = getattr(mpi_lib, function)(tensors, outputs, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tuple(tensors), tuple(outputs)) @@ -333,7 +383,8 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc def grouped_allreduce_async(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous averaging or summation of the input tensor list over all the 
Horovod processes. The input tensors are not modified. @@ -350,13 +401,16 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different - ranks. Defaults to Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or @@ -364,31 +418,34 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) outputs = [t.new(t.shape) for t in tensors] - return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor) + return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set) class HorovodGroupedAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a list of tensors.""" @staticmethod - def forward(ctx, average, name, op, prescale_factor, postscale_factor, *tensors): + def forward(ctx, average, name, op, prescale_factor, postscale_factor, process_set: ProcessSet, *tensors): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - handle = grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor) + ctx.process_set = process_set + handle = 
grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor, + process_set) return synchronize(handle) @staticmethod def backward(ctx, *grad_output): grad_reduced = grouped_allreduce(list(grad_output), average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor) - return (None, None, None, None, None, *grad_reduced) + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set) + return (None, None, None, None, None, None, *grad_reduced) def grouped_allreduce(tensors, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): """ A function that performs averaging or summation of the input tensor list over all the Horovod processes. The input tensors are not modified. @@ -409,16 +466,19 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. Defaults + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
Returns: A list containing tensors of the same shape and type as in `tensors`, @@ -427,12 +487,13 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. tensors_compressed, ctxs = zip(*[compression.compress(t) for t in tensors]) summed_tensors_compressed = HorovodGroupedAllreduce.apply(average, name, op, prescale_factor, postscale_factor, - *tensors_compressed) + process_set, *tensors_compressed) return [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] def grouped_allreduce_async_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous in-place averaging or summation of the input tensors over all the Horovod processes. @@ -449,24 +510,28 @@ def grouped_allreduce_async_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or `synchronize()`. 
""" op = handle_average_backwards_compatibility(op, average) - return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor) + return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor, process_set) def grouped_allreduce_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs in-place averaging or summation of the input tensors over all the Horovod processes. @@ -483,38 +548,66 @@ def grouped_allreduce_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A list containing tensors of the same shape and type as in `tensors`, averaged or summed across all processes. 
""" - handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor) + handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) +def sparse_allreduce_async(tensor, name, op, process_set=global_process_set): + # Allgather aggregates along the first dimension, so we need to transpose the + # indices to enforce correct concatenation behavior, then transpose back prior to + # constructing the new aggregated sparse gradient + t = tensor + indices_handle = allgather_async(t._indices().transpose(0, 1).contiguous(), name=f'{name}.indices', + process_set=process_set) + values_handle = allgather_async(t._values(), name=f'{name}.values', process_set=process_set) + + def handle(): + # We need to sync values handle firstly for torch nightly >= 10.0 + # Issue: https://github.com/horovod/horovod/issues/2961 + values = synchronize(values_handle) + indices = synchronize(indices_handle) + + values = (values / process_set.size()) if op == Average else values + + if indices.dim() == 0 or values.dim() == 0: + return t.new().resize_as_(t) + return t.new(indices.transpose(0, 1), values, t.size()) + + return handle + + def _allgather_function_factory(tensor): return 'horovod_torch_allgather_async_' + tensor.type().replace('.', '_') -def _allgather_async(tensor, output, name): +def _allgather_async(tensor, output, name, process_set: ProcessSet): function = _check_function(_allgather_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, name.encode() if name is not None else _NULL) + tensor, output, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def allgather_async(tensor, name=None): +def allgather_async(tensor, name=None, process_set=global_process_set): """ A function that asynchronously concatenates the 
input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -526,44 +619,47 @@ def allgather_async(tensor, name=None): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the allgather operation that can be used with `poll()` or `synchronize()`. """ output = tensor.new() - return _allgather_async(tensor, output, name) + return _allgather_async(tensor, output, name, process_set) class HorovodAllgather(torch.autograd.Function): """An autograd function that performs allgather on a tensor.""" @staticmethod - def forward(ctx, tensor, name): + def forward(ctx, tensor, name, process_set: ProcessSet): ctx.dim = tensor.shape[0] - handle = allgather_async(tensor, name) + ctx.process_set = process_set + handle = allgather_async(tensor, name, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False) + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#average CHANGE dim_t = torch.IntTensor([ctx.dim]) - dim = allgather(dim_t).view(size()) + dim = allgather(dim_t, process_set=ctx.process_set).view(ctx.process_set.size()) - r = rank() + r = ctx.process_set.rank() offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - return grad_reduced.narrow(0, offset, ctx.dim), None + return grad_reduced.narrow(0, offset, ctx.dim), None, None -def allgather(tensor, name=None): +def allgather(tensor, name=None, process_set=global_process_set): """ A function that concatenates the input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. 
- The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. + The concatenation is done on the first dimension, so the corresponding + input tensors on the different processes must have the same rank and + shape, except for the first dimension, which is allowed to be different. This acts as a thin wrapper around an autograd function. If your input tensor requires gradients, then callings this function will allow gradients @@ -572,6 +668,8 @@ def allgather(tensor, name=None): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same type as `tensor`, concatenated on dimension zero @@ -579,25 +677,118 @@ def allgather(tensor, name=None): the first dimension, which may be greater and is the sum of all first dimensions of the tensors in different Horovod processes. 
""" - return HorovodAllgather.apply(tensor, name) + return HorovodAllgather.apply(tensor, name, process_set) + + +def _grouped_allgather_function_factory(tensor): + return 'horovod_torch_grouped_allgather_async_' + tensor.type().replace('.', '_') + + +def _grouped_allgather_async(tensors, outputs, name, process_set: ProcessSet): + function = _check_function(_grouped_allgather_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)( + tensors, outputs, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_allgather_async(tensors, name=None, process_set=global_process_set): + """ + A function that asynchronously concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. The input + tensors are not modified. + + The concatenation is done on the first dimension, so the input tensors on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the group allgather operation that can be used with `poll()` or + `synchronize()`. 
+ """ + outputs = [t.new() for t in tensors] + return _grouped_allgather_async(tensors, outputs, name, process_set) + + +class HorovodGroupedAllgather(torch.autograd.Function): + """An autograd function that performs allgather on a list of tensors.""" + + @staticmethod + def forward(ctx, name, process_set: ProcessSet, *tensors): + ctx.dims = [t.shape[0] for t in tensors] + ctx.process_set = process_set + handle = grouped_allgather_async(list(tensors), name, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + grad_reduced = grouped_allreduce(list(grad_output), average=True, process_set=ctx.process_set) + + dim_ts = [torch.IntTensor([dim]) for dim in ctx.dims] + dims = [dt.view(ctx.process_set.size()) + for dt in grouped_allgather(dim_ts, process_set=ctx.process_set)] + + r = ctx.process_set.rank() + offsets = [torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 + for dim in dims] + result = [gr.narrow(0, offset, dim) + for gr, offset, dim in zip(grad_reduced, offsets, ctx.dims)] + return (None, None, *result) + + +def grouped_allgather(tensors, name=None, process_set=global_process_set): + """ + A function that concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. + The input tensors are not modified. + + The concatenation is done on the first dimension, so the corresponding input + tensors on the different processes must have the same rank and shape, except + for the first dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A list containing tensors of the same type as in `tensors`. Each tensor + is concatenated on dimension zero across all processes. 
Its shape is + identical to the corresponding input shape, expect for the first + dimension, which may be greater and is the sum of all first dimensions + of the corresponding tensor in different Horovod processes. + """ + return HorovodGroupedAllgather.apply(name, process_set, *tensors) def _broadcast_function_factory(tensor): return 'horovod_torch_broadcast_async_' + tensor.type().replace('.', '_') -def _broadcast_async(tensor, output, root_rank, name): +def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet): function = _check_function(_broadcast_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, root_rank, name.encode() if name is not None else _NULL) + tensor, output, root_rank, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def broadcast_async(tensor, root_rank, name=None): +def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -611,33 +802,36 @@ def broadcast_async(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`. 
""" output = tensor.new(tensor.shape) - return _broadcast_async(tensor, output, root_rank, name) + return _broadcast_async(tensor, output, root_rank, name, process_set) class HorovodBroadcast(torch.autograd.Function): """An autograd function that broadcasts a tensor.""" @staticmethod - def forward(ctx, tensor, root_rank, name): + def forward(ctx, tensor, root_rank, name, process_set: ProcessSet): ctx.root_rank = root_rank - handle = broadcast_async(tensor, root_rank, name) + ctx.process_set = process_set + handle = broadcast_async(tensor, root_rank, name, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False) + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#True CHANGED if rank() != ctx.root_rank: grad_reduced *= 0 - return grad_reduced, None, None + return grad_reduced, None, None, None -def broadcast(tensor, root_rank, name=None): +def broadcast(tensor, root_rank, name=None, process_set=global_process_set): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -655,15 +849,17 @@ def broadcast(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - return HorovodBroadcast.apply(tensor, root_rank, name) + return HorovodBroadcast.apply(tensor, root_rank, name, process_set) -def broadcast_async_(tensor, root_rank, name=None): +def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_set): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -677,15 +873,17 @@ def broadcast_async_(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`. """ - return _broadcast_async(tensor, tensor, root_rank, name) + return _broadcast_async(tensor, tensor, root_rank, name, process_set) -def broadcast_(tensor, root_rank, name=None): +def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -699,35 +897,37 @@ def broadcast_(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - handle = broadcast_async_(tensor, root_rank, name) + handle = broadcast_async_(tensor, root_rank, name, process_set) return synchronize(handle) def _alltoall_function_factory(tensor): return 'horovod_torch_alltoall_async_' + tensor.type().replace('.', '_') -def _alltoall_async(tensor, splits, output, name): +def _alltoall_async(tensor, splits, output, output_received_splits, name, process_set: ProcessSet): if splits is None: # If splits not provided, create empty tensor as placeholder splits = torch.tensor([], dtype=torch.int32, device='cpu') elif not isinstance(splits, torch.Tensor): splits = torch.tensor(splits, dtype=torch.int32, device='cpu') - function = _check_function(_alltoall_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, splits, output, name.encode() if name is not None else _NULL) + tensor, splits, output, output_received_splits, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) - _handle_map[handle] = (tensor, splits, output) + _handle_map[handle] = (tensor, splits, (output, output_received_splits)) return handle -def alltoall_async(tensor, splits=None, name=None): +def alltoall_async(tensor, splits=None, name=None, process_set=global_process_set): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. The input @@ -745,37 +945,45 @@ def alltoall_async(tensor, splits=None, name=None): not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the alltoall operation that can be used with `poll()` or `synchronize()`. 
""" output = tensor.new() - return _alltoall_async(tensor, splits, output, name) + if isinstance(splits, torch.Tensor): + output_received_splits = splits.new() + else: + output_received_splits = torch.empty(size(), dtype=torch.int32, device='cpu') + return _alltoall_async(tensor, splits, output, output_received_splits, name, process_set) class HorovodAlltoall(torch.autograd.Function): """An autograd function that performs alltoall on a tensor.""" @staticmethod - def forward(ctx, tensor, splits, name): - ctx.tensor = tensor - ctx.splits = splits - handle = alltoall_async(tensor, splits, name) - return synchronize(handle) + def forward(ctx, tensor, splits, name, process_set: ProcessSet): + handle = alltoall_async(tensor, splits, name, process_set) + output, received_splits = synchronize(handle) + + ctx.process_set = process_set + ctx.recvsplits = received_splits + if splits is None: + return output + else: + ctx.mark_non_differentiable(received_splits) + return output, received_splits @staticmethod - def backward(ctx, grad_output): - recvsplits = None - if ctx.splits is not None: - recvsplits = alltoall(ctx.splits, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) - else: - splits_equal = torch.ones(size(), dtype=torch.int32, device='cpu') * (ctx.tensor.size()[0] // size()) - recvsplits = alltoall(splits_equal, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) - return alltoall(grad_output, splits=recvsplits), None, None + def backward(ctx, grad_output, *dead_gradients): + grad_wrt_tensor, _ = alltoall(grad_output, splits=ctx.recvsplits, + process_set=ctx.process_set) + return grad_wrt_tensor, None, None, None -def alltoall(tensor, splits=None, name=None): +def alltoall(tensor, splits=None, name=None, process_set=global_process_set): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. 
The input @@ -797,22 +1005,263 @@ def alltoall(tensor, splits=None, name=None): not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: - A tensor containing the gathered tensor data from all workers. + 1) A tensor containing the gathered tensor data from all workers. + 2) If `splits` has been provided: A tensor of integers in rank order + describing how many elements in the output tensor have been received + from each worker. + """ + return HorovodAlltoall.apply(tensor, splits, name, process_set) + + +def _reducescatter_function_factory(tensor): + return 'horovod_torch_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _reducescatter_async(tensor, output, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_reducescatter_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)(tensor, output, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def reducescatter_async(tensor, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): """ - return HorovodAlltoall.apply(tensor, splits, name) + A function that performs asynchronous reduction of the input tensor over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensor is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. 
The reduction will not start until all processes + are ready to send and receive the tensor. + + The input tensors on the different processes must have the same rank and shape. The + output tensor will be the same rank on all processes, but the first dimension may + be different. + + Arguments: + tensor: A tensor to average and sum. + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A handle to the reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + output = tensor.new() + return _reducescatter_async(tensor, output, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a tensor.""" + + @staticmethod + def forward(ctx, tensor, name, op, process_set, prescale_factor, postscale_factor): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = reducescatter_async(tensor, name, op, process_set, prescale_factor, postscale_factor) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + if ctx.op == Sum: + grad_output *= ctx.process_set.size() + if ctx.prescale_factor != 1.0: + grad_output *= ctx.prescale_factor + if ctx.postscale_factor != 1.0: + grad_output *= ctx.postscale_factor + + return allgather(grad_output, process_set=ctx.process_set), None, None, None, None, None + + +def reducescatter(tensor, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): 
+ """ + A function that performs reduction of the input tensor over all the Horovod + processes, then scatters the results across all Horovod processes. The input tensor + is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to average/sum and scatter. + name: A name of the reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A tensor of the same rank and type as `tensor` across all processes. The shape + is identical to the input shape, except for the first dimension, which will be + divided across the different Horovod processes. 
+ """ + tensor_compressed, ctx = compression.compress(tensor) + reduced_tensor_compressed = HorovodReducescatter.apply(tensor_compressed, name, op, process_set, + prescale_factor, postscale_factor) + return compression.decompress(reduced_tensor_compressed, ctx) + + +def _grouped_reducescatter_function_factory(tensor): + return 'horovod_torch_grouped_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _grouped_reducescatter_async(tensors, outputs, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_grouped_reducescatter_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)(tensors, outputs, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_reducescatter_async(tensors, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs asynchronous reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. For each of the input tensors, the type and shape must + be the same on all Horovod processes for a given name. The reduction will not start + until all processes are ready to send and receive the tensors. + + The input tensor at some place in the list tensor must have the same rank and shape + on the different processes. The corresponding output tensor will be the same rank on + all processes, but the first dimension may be different. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. 
+ op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + Returns: + A handle to the group reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + outputs = [t.new() for t in tensors] + return _grouped_reducescatter_async(tensors, outputs, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodGroupedReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a list of tensors.""" + + @staticmethod + def forward(ctx, name, op, process_set, prescale_factor, postscale_factor, *tensors): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = grouped_reducescatter_async(list(tensors), name, op, process_set, + prescale_factor, postscale_factor, ) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + if ctx.op == Sum: + grad_output = [g * ctx.process_set.size() for g in grad_output] + if ctx.prescale_factor != 1.0: + grad_output = [ctx.prescale_factor * g for g in grad_output] + if ctx.postscale_factor != 1.0: + grad_output = [ctx.postscale_factor * g for g in grad_output] + + return (None, None, None, None, None, + *grouped_allgather(grad_output, process_set=ctx.process_set)) + + +def grouped_reducescatter(tensors, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. 
+ + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + + Returns: + A list containing tensors of the same rank and type as in `tensors`. For each + tensor the shape is identical to the input shape, except for the first dimension, + which will be divided across the different Horovod processes. + """ + tensors_compressed = [] + ctxs = [] + for tensor in tensors: + tensor_compressed, ctx = compression.compress(tensor) + tensors_compressed.append(tensor_compressed) + ctxs.append(ctx) + reduced_tensors_compressed = HorovodGroupedReducescatter.apply(name, op, process_set, + prescale_factor, postscale_factor, + *tensors_compressed) + return [compression.decompress(reduced_tensor_compressed, ctx) + for reduced_tensor_compressed, ctx in zip(reduced_tensors_compressed, ctxs)] def poll(handle): """ - Polls an allreduce, allgather or broadcast handle to determine whether underlying - asynchronous operation has completed. After `poll()` returns `True`, `synchronize()` - will return without blocking. 
+ Polls an allreduce, allgather, alltoall, broadcast, or reducescatter handle to determine whether + underlying asynchronous operation has completed. After `poll()` returns `True`, + `synchronize()` will return without blocking. Arguments: - handle: A handle returned by an allreduce, allgather or broadcast asynchronous - operation. + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. Returns: A flag indicating whether the operation has completed. @@ -822,15 +1271,15 @@ def poll(handle): def synchronize(handle): """ - Synchronizes an asynchronous allreduce, allgather or broadcast operation until - it's completed. Returns the result of the operation. + Synchronizes an asynchronous allreduce, allgather, alltoall, broadcast, or reducescatter operation + until it's completed. Returns the result of the operation. Arguments: - handle: A handle returned by an allreduce, allgather or broadcast asynchronous - operation. + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. Returns: - An output tensor of the operation. + A single output tensor of the operation or a tuple of multiple output tensors. """ if handle not in _handle_map: return @@ -840,10 +1289,11 @@ def synchronize(handle): output = _handle_map.pop(handle)[-1] return output except RuntimeError as e: + _handle_map.pop(handle, None) raise HorovodInternalError(e) -def join(device=-1): +def join(device=-1) -> int: """A function that indicates that the rank finished processing data. All ranks that did not call join() continue to process allreduce operations. @@ -855,7 +1305,32 @@ def join(device=-1): Returns: Id of the rank that joined last. 
""" + output = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu")) + try: + handle = mpi_lib.horovod_torch_join(output, device) + except RuntimeError as e: + raise HorovodInternalError(e) + + _handle_map[handle] = (None, output) + + return synchronize(handle).item() + +def barrier(process_set=global_process_set): + """ + A function that acts as a simple sychronization point for ranks specified + in the given process group(default to global group). Ranks that reach + this function call will stall until all other ranks have reached. + + Arguments: + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + """ + try: - return mpi_lib.horovod_torch_join(device) + handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) + + _handle_map[handle] = (None, None) + + synchronize(handle) diff --git a/patch_files/horovod/torch/optimizer.py b/patch_files/horovod/torch/optimizer.py index ffe7d42..8c742ae 100644 --- a/patch_files/horovod/torch/optimizer.py +++ b/patch_files/horovod/torch/optimizer.py @@ -25,28 +25,31 @@ from horovod.torch.compression import Compression from horovod.torch.functions import broadcast_object -from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_ +from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_, sparse_allreduce_async from horovod.torch.mpi_ops import synchronize from horovod.torch.mpi_ops import size from horovod.torch.mpi_ops import Average, Adasum, Sum from horovod.torch.mpi_ops import rocm_built +from horovod.torch.mpi_ops import ProcessSet, global_process_set class _DistributedOptimizer(torch.optim.Optimizer): - def __init__(self, params, named_parameters, grace, compression, + def __init__(self, params, named_parameters,grace, compression, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - num_groups=0): + groups=None, + 
sparse_as_dense=False, + process_set=global_process_set): super(self.__class__, self).__init__(params) - self._compression = compression self._grace = grace + self._compression = compression if named_parameters is not None: named_parameters = list(named_parameters) else: - named_parameters = [('allreduce.noname.%s' % i, v) - for param_group in self.param_groups - for i, v in enumerate(param_group['params'])] + named_parameters = [(f'allreduce.noname.{i}.{j}', v) + for i, param_group in enumerate(self.param_groups) + for j, v in enumerate(param_group['params'])] # make sure that named_parameters are tuples if any([not isinstance(p, tuple) for p in named_parameters]): raise ValueError('named_parameters should be a sequence of ' @@ -74,15 +77,33 @@ def __init__(self, params, named_parameters, grace, compression, for _, v in sorted(named_parameters)} self.op = op self.gradient_predivide_factor = gradient_predivide_factor + self.sparse_as_dense = sparse_as_dense + self.process_set = process_set + self._handles = {} self._grad_accs = [] self._requires_update = set() self._synchronized = False self._should_synchronize = True - self._num_groups = num_groups + + if groups is not None: + if not (isinstance(groups, list) or groups > 0): + raise ValueError('groups should be a non-negative integer or ' + 'a list of list of torch.Tensor.') + if isinstance(groups, list): + grouped_parameter_ids = set() + for l in groups: + for p in l: + if not isinstance(p, torch.Tensor): + raise ValueError('groups must consist of torch.Tensor.') + if id(p) in grouped_parameter_ids: + raise ValueError('A parameter can only appear once in groups.') + grouped_parameter_ids.add(id(p)) + self._groups = groups self._p_to_group = {} self._group_counts = {} - if size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1': + + if self.process_set.included() and (size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1'): self._register_hooks() def load_state_dict(self, *args, **kwargs): @@ -109,8 +130,7 @@ def 
set_backward_passes_per_step(self, passes): self._allreduce_delay[p] = self.backward_passes_per_step def _register_hooks(self): - - if self._num_groups > 0: + if self._groups is not None: p_list = [] # Get list of parameters with grads for param_group in self.param_groups: @@ -121,16 +141,29 @@ def _register_hooks(self): # To ensure parameter order and group formation is consistent, broadcast p_list order # from rank 0 and use for every worker p_list_names = [self._parameter_names.get(p) for p in p_list] - p_list_names = broadcast_object(p_list_names, root_rank=0) - p_list = sorted(p_list, key=lambda p : p_list_names.index(self._parameter_names.get(p))) + p_list_names = broadcast_object(p_list_names, root_rank=0, process_set=self.process_set) + p_list = sorted(p_list, key=lambda p: p_list_names.index(self._parameter_names.get(p))) # Form groups - p_groups = split_list(p_list, self._num_groups) + if isinstance(self._groups, list): + p_groups = [] + grouped_id = set() + p_list_ids = [id(p) for p in p_list] + for group in self._groups: + p_groups.append([p for p in group if id(p) in p_list_ids]) + for p in p_groups[-1]: + grouped_id.add(id(p)) + for p in p_list: + if id(p) not in grouped_id: + p_groups.append([p]) + else: + p_groups = split_list(p_list, self._groups) + p_groups = [tuple(p) for p in p_groups] for group in p_groups: - for p in group: - self._p_to_group[p] = group - self._group_counts[group] = 0 + for p in group: + self._p_to_group[p] = group + self._group_counts[group] = 0 for param_group in self.param_groups: for p in param_group['params']: @@ -143,16 +176,36 @@ def _register_hooks(self): self._grad_accs.append(grad_acc) def _allreduce_grad_async(self, p): + if p.grad is None: + # Gradient was not computed, but we still need to submit a tensor to allreduce + # as one of the other ranks may have computed it (due to dynamic forward functions). + # + # NOTE: this will not work if the gradient is sparse and we perform an allgather. 
+ Unfortunately, there doesn't appear to be a good way to detect that the parameter will + produce sparse gradients before computing the gradient. + p.grad = p.data.new(p.size()).zero_() + name = self._parameter_names.get(p) tensor = p.grad - if self._grace and self._num_groups == 0 and self.op == Average: + + if p.grad.is_sparse: + if self.sparse_as_dense: + tensor = tensor.to_dense() + else: + return self._sparse_allreduce_grad_async(p, name) + # print("self._compression.",self._compression.compress) + # print("self._compression.",self._compression.compressor) + if self._grace and self._groups == 0 and self.op == Average: + # print("HERE") handle, ctx = self._grace.send_step(tensor, name) + postscale_factor = self.gradient_predivide_factor else: + tensor_compressed, ctx = self._compression.compress(tensor) if self.op == Average: - # Split average operation across pre/postscale factors - # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. + # Split average operation across pre/postscale factors + # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. 
prescale_factor = 1.0 / self.gradient_predivide_factor postscale_factor = self.gradient_predivide_factor else: @@ -160,17 +213,24 @@ def _allreduce_grad_async(self, p): postscale_factor = 1.0 handle = allreduce_async_(tensor_compressed, name=name, op=self.op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor) + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + process_set=self.process_set) return handle, ctx def _grouped_allreduce_grad_async(self, ps): name = self._parameter_names.get(ps[0]) tensors_compressed, ctxs = zip(*[self._compression.compress(p.grad) for p in ps]) - handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op) + handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op, + process_set=self.process_set) return handle, ctxs + def _sparse_allreduce_grad_async(self, p, name): + handle = sparse_allreduce_async(p.grad, name=name, op=self.op, + process_set=self.process_set) + return handle, None + def _make_hook(self, p): def hook(*ignore): if p in self._handles and self._handles[p][0] is not None: @@ -185,7 +245,7 @@ def hook(*ignore): handle, ctx = None, None self._allreduce_delay[p] -= 1 if self._allreduce_delay[p] == 0: - if self._num_groups > 0: + if self._groups is not None: group = self._p_to_group[p] self._group_counts[group] += 1 if self._group_counts[group] == len(group): @@ -202,6 +262,10 @@ def hook(*ignore): return hook def synchronize(self): + if not self.process_set.included(): + self._synchronized = True + return + completed = set() for x in self._handles.keys(): completed.update(x) if isinstance(x, tuple) else completed.add(x) @@ -214,23 +278,36 @@ def synchronize(self): if handle is None: handle, ctx = self._allreduce_grad_async(p) self._handles[p] = (handle, ctx) - for p, (handle, ctx) in self._handles.items(): + if isinstance(p, tuple): # This was a grouped result, need to unpack outputs = synchronize(handle) for gp, output, gctx in zip(p, outputs, ctx): 
self._allreduce_delay[gp] = self.backward_passes_per_step gp.grad.set_(self._compression.decompress(output, gctx)) + if self._groups is not None and self._group_counts[p] != 0: + self._group_counts[p] = 0 else: - if self._grace and self._num_groups == 0 and self.op == Average: - # in GRACE, p is not tuple, but handle is. - output = self._grace.receive_step(handle, ctx) - self._allreduce_delay[p] = self.backward_passes_per_step - p.grad.set_(output) + # When handle is a callable function, it returns the aggregated tensor result + if self._grace and self._groups == 0 and self.op == Average: + output = self._grace.receive_step(handle, ctx) + p.grad.set_(output) else: - output = synchronize(handle) + output = synchronize(handle) if not callable(handle) else handle() self._allreduce_delay[p] = self.backward_passes_per_step + if self._groups is not None: + group = self._p_to_group[p] + if self._group_counts[group] != 0: + self._group_counts[group] = 0 + if p.grad.is_sparse: + aggregated = self._compression.decompress(output, ctx) + if not aggregated.is_sparse: + # When sparse_as_dense=True we need to convert the grad back to sparse before update + aggregated = aggregated.to_sparse() + + # Sparse grads do not support set_ for some reason, so we do this as an equivalent + p.grad.zero_().add_(aggregated) p.grad.set_(self._compression.decompress(output, ctx)) self._handles.clear() @@ -448,13 +525,14 @@ def zero_grad(self): return super(self.__class__, self).zero_grad() -def DistributedOptimizer(optimizer, - grace=None, named_parameters=None, +def DistributedOptimizer(optimizer, named_parameters=None,grace=None, compression=Compression.none, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - num_groups=0): + num_groups=0, groups=None, + sparse_as_dense=False, + process_set=global_process_set): """ An optimizer that wraps another torch.optim.Optimizer, using an allreduce to combine gradient values before applying gradients to model weights. 
@@ -499,6 +577,18 @@ def DistributedOptimizer(optimizer, gradient_predivide_factor / size after the sum. num_groups: Number of groups to assign gradient allreduce ops to for explicit grouping. Defaults to no explicit groups. + groups: The parameter to group the gradient allreduce ops. Accept values is a + non-negative integer or a list of list of torch.Tensor. + If groups is a non-negative integer, it is the number of groups to assign + gradient allreduce ops to for explicit grouping. + If groups is a list of list of torch.Tensor. Tensors in the same + inner list will be assigned to the same group, while parameter that does + not appear in any list will form a group itself. + Defaults as None, which is no explicit groups. + sparse_as_dense: If set True, convert all sparse gradients to dense and perform allreduce, then + convert back to sparse before applying the update. + process_set: Gradients will only be reduced over Horovod processes belonging + to this process set. Defaults to the global process set. """ # We dynamically create a new class that inherits from the optimizer that was passed in. # The goal is to override the `step()` method with an allreduce implementation. 
@@ -508,12 +598,23 @@ def DistributedOptimizer(optimizer, if op != Average: raise ValueError('gradient_predivide_factor not supported with op != Average') + if num_groups != 0: + warnings.warn('Parameter `num_groups` has been replaced by `groups` ' + 'and will be removed in v0.23.0.', DeprecationWarning) + if groups is None: + groups = num_groups + + if backward_passes_per_step <= 0: + raise ValueError("backward_passes_per_step must be > 0") + if op != Adasum or size() == 1: cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedOptimizer.__dict__)) - return cls(optimizer.param_groups, named_parameters, grace, compression, backward_passes_per_step, op, - gradient_predivide_factor, num_groups) + return cls(optimizer.param_groups, named_parameters,grace, compression, backward_passes_per_step, op, + gradient_predivide_factor, groups, sparse_as_dense, process_set) else: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedAdasumOptimizer.__dict__)) return cls(optimizer.param_groups, named_parameters, compression, backward_passes_per_step) From 8c1f704970312dd5ac07987769474aff8fc93227 Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Sat, 20 Jul 2024 17:14:18 -0400 Subject: [PATCH 02/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes examples/.DS_Store | Bin 6148 -> 6148 bytes examples/dist/.DS_Store | Bin 0 -> 6148 bytes examples/torch/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/pytorch_mnist.cpython-310.pyc | Bin 6384 -> 0 bytes grace_dl/.DS_Store | Bin 6148 -> 6148 bytes grace_dl/__pycache__/__init__.cpython-310.pyc | Bin 185 -> 0 bytes grace_dl/dist/.DS_Store | Bin 0 -> 6148 bytes grace_dl/tensorflow/.DS_Store | Bin 8196 -> 8196 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 4998 -> 0 bytes grace_dl/tensorflow/communicator/.DS_Store | Bin 0 -> 
6148 bytes .../__pycache__/allgather.cpython-310.pyc | Bin 1622 -> 0 bytes grace_dl/tensorflow/compressor/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/topk.cpython-310.pyc | Bin 1969 -> 0 bytes grace_dl/tensorflow/memory/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/residual.cpython-310.pyc | Bin 1295 -> 0 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 2858 -> 0 bytes .../torch/__pycache__/helper.cpython-310.pyc | Bin 3310 -> 0 bytes grace_dl/torch/communicator/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/allgather.cpython-310.pyc | Bin 1961 -> 0 bytes .../__pycache__/allreduce.cpython-310.pyc | Bin 1111 -> 0 bytes .../__pycache__/broadcast.cpython-310.pyc | Bin 1549 -> 0 bytes grace_dl/torch/compressor/.DS_Store | Bin 0 -> 6148 bytes .../__pycache__/fp16.cpython-310.pyc | Bin 974 -> 0 bytes .../__pycache__/randomk.cpython-310.pyc | Bin 1897 -> 0 bytes .../__pycache__/topk.cpython-310.pyc | Bin 1654 -> 0 bytes grace_dl/torch/memory/.DS_Store | Bin 0 -> 6148 bytes .../memory/__pycache__/none.cpython-310.pyc | Bin 749 -> 0 bytes .../__pycache__/residual.cpython-310.pyc | Bin 1093 -> 0 bytes patch_files/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/_keras/__init__.py | 183 ----- patch_files/horovod/keras/__init__.py | 183 ----- patch_files/horovod/tensorflow/__init__.py | 761 ------------------ .../horovod/tensorflow/keras/__init__.py | 225 ------ update_patch_files.sh | 5 - 36 files changed, 1357 deletions(-) create mode 100644 examples/dist/.DS_Store create mode 100644 examples/torch/.DS_Store delete mode 100644 examples/torch/__pycache__/pytorch_mnist.cpython-310.pyc delete mode 100644 grace_dl/__pycache__/__init__.cpython-310.pyc create mode 100644 grace_dl/dist/.DS_Store delete mode 100644 grace_dl/tensorflow/__pycache__/__init__.cpython-310.pyc create mode 100644 grace_dl/tensorflow/communicator/.DS_Store delete mode 100644 
grace_dl/tensorflow/communicator/__pycache__/allgather.cpython-310.pyc create mode 100644 grace_dl/tensorflow/compressor/.DS_Store delete mode 100644 grace_dl/tensorflow/compressor/__pycache__/topk.cpython-310.pyc create mode 100644 grace_dl/tensorflow/memory/.DS_Store delete mode 100644 grace_dl/tensorflow/memory/__pycache__/residual.cpython-310.pyc delete mode 100644 grace_dl/torch/__pycache__/__init__.cpython-310.pyc delete mode 100644 grace_dl/torch/__pycache__/helper.cpython-310.pyc create mode 100644 grace_dl/torch/communicator/.DS_Store delete mode 100644 grace_dl/torch/communicator/__pycache__/allgather.cpython-310.pyc delete mode 100644 grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc delete mode 100644 grace_dl/torch/communicator/__pycache__/broadcast.cpython-310.pyc create mode 100644 grace_dl/torch/compressor/.DS_Store delete mode 100644 grace_dl/torch/compressor/__pycache__/fp16.cpython-310.pyc delete mode 100644 grace_dl/torch/compressor/__pycache__/randomk.cpython-310.pyc delete mode 100644 grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc create mode 100644 grace_dl/torch/memory/.DS_Store delete mode 100644 grace_dl/torch/memory/__pycache__/none.cpython-310.pyc delete mode 100644 grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc delete mode 100644 patch_files/horovod/_keras/__init__.py delete mode 100644 patch_files/horovod/keras/__init__.py delete mode 100644 patch_files/horovod/tensorflow/__init__.py delete mode 100644 patch_files/horovod/tensorflow/keras/__init__.py diff --git a/.DS_Store b/.DS_Store index 8e2c11e11d140309e12d25b0f48ef8971855e5d3..64c196cbe54c373dd976e1595fa9714f12bfc5b9 100644 GIT binary patch delta 28 icmZp1XmQxEUWk!#@&=(>KvGadYOKvGadYOw{M7)Y3fA%!86p_rirF14A5C6|#C V!A)e{%E-ttS)6^>W_FIh`~YFP7FqxR delta 55 zcmZoMXfc@JFDS^sz`)4BAi$85ZWx@LpIfk5kYhD7qu^#mmR!co>sjYAGBQr~W#2Wi Lz-%)+$6tN`dF&4W diff --git a/examples/dist/.DS_Store b/examples/dist/.DS_Store new file mode 100644 index 
0000000000000000000000000000000000000000..086c2e5940a62da8ca0fabc11f205d5c2741ddae GIT binary patch literal 6148 zcmeHKyG{c!5S)b+k!Y@@^ba7BKR7}_qJyKdtbO4rgM|}A(H{W-k*-1r=Naq>%Sm6>E81OX9{ypH_4%fKDEv_c~ zPrUJ*SM2dV>|eIeyWQ54Wl}&2NC7Dz1*E`13RF2=o*i_G$WlNG9HRpMeQ0#YzHm&8 zPX~u+0f-BR!#Iy#g4jGj>)|36Qdk+;pO-|k}|LPocn#@m>6`%gHF`Xfa@ZY0{^YR4<H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0um z`iH7+&aOrz5@3W3b&#E75^MkxFpTWQ2VV>aIRv@nkV6poRyQL7fl;&(8#D0cw0VfO4$(Hzs-M-e_WG;0-Lmt(femd1Zt=p5?YDNKVyigs z=|)AS=~`53mFJvFRBct~^g^o!eNK2JT5K&wORc3jzuu~=agIi{*0SOi!(-9$)^SC< z;fd&E>!hMf;i>3!>$IZF;hE@c>#U+#SZmd8X}rR#k2PKmiv4rjMr(yH@Y-Xobw2nC zKeEBZ-v+NtFU+*o#o%K9)v}iSMNsQsn!YyEf{WfI@EYQ;gV*`uHmy~?|3+{=xY*H& zkLOK1Z}FuKrrz}p#*aR=F*dua1AK9Fzou-dLZT% z5%6I0E2E9j&8t zowLVI%isn#v1jIvW@;@9t7%DNEy&t1@ckdr79LWTevBg1GJT|NvniXw{4%3&k~&iL z)EH@=r8o?RyT`FJ81hVm%mG#8HkmYPuBsI{Q?(M;Fac+6mj>J=SKOsx@tLKKK*T}l zrM<0Sw@!jdT*7nG0na2Ycq)Venivf-@ub@ zqR6zVjt6TzHAeaj+f5KMCf*+3gv=t@fpnx#ZQ92f%UhyFA#XwTzB9tE_lwwN$RTq{ zrdpPWBI8-xrK#*b!%r-L!V)?#Zfr^1#ZKw(OH%~lP`Z)-nKww1@Cp}4(M}wvVi|=r zH+#XS;v|s;+81n;c!i{=h*b)cu9qgA34yp1QdQ8*e#b`5^ zTb^ml8YG3RGOglSm}w|$+~uX2ninrWHOCCPkMtM&Oa1zYq1REgQhj;lsqq9(t)$B% z*3mKAv5Cv8pRdhraL$hiAot|wN7h~q@1^i7(JNz$ZFPQ>FY{xc8{@*r9u>CrCk*oA;f3hp*n#F( z`H5|PT-?!i^c}Wi?pQnaPI1TWIQ%3(_1GD^;M*O>PxCX6ohJs`8LCxb$LZMoEI)@f zrR)+n@O`gw;}`(^ruKhc{E=6?xXQ4Uavj6 z+WfW7_+sOe;_W0&A-dZ9jn4RAFc5yi7|1f^2{;V|V$%<$rA`LYrWD)Hq!9+OG<#VPHHy-jkENDDP2JrmFU!)N z+v7kXi#h1@_-E22P?L7%i*Arf8%fGwm`Ri7(r}~+Hs%y;1~Gx8G`gZKEo2r!2=JiS zWm!l#SHiM_KYWJIv2K;THixu>oJK9q=lJOKHj3qfsoT$%SzgxkoO+$P`m#a1*O{Zc ztghS08*2t=-2tbpJ1PTmSe4nvt7ymm$o$u>dIa0)G~Wr6wjZYNHq{KC%P(o$=~007 zO%Mo`$N-W3v5ZX(3V(1329P5AhX>nxq3rOjaQO@*fX{PeI zX+`zY9~^_e>I4XOhIHurv_63u@ZvTbYoK)Qg3_}EUYgdxFOPtTGt3NM$G8?A8QZ|@ zMPA8oGqCy9Lz~~-TfO%w#~{qv+YXg|+yl3;5d;I?i&_qCDLGrKOqAbl4{@x2xb;={ zUXW%h@VToib7)ziW(xSe))~LKa(&)ZajpOwT|o{d=(N>XI%HyIKo<2Kk`dO3?@~d= 
zCe~3%0|yYvf(Nod=M@jBiP0VmrPYBS!{GsP!l%X_6@NfIOh99R!)l0cXcSt-3^UEY zp`ysWk5KwWkNWjhKSF!44YFa2kk}8YNT?W~kfpgMg&$>A`6c+k=os#w(L8OUpmUG| z^e}-53L{kgIIFV@df6=NuI1`>!2v=5EA*|U7c8H2N&Rx8q1H>f-^vrhyMQ{Aj^ARa z<`wxd4p0EOfM6M=T2AQ_!t-FW*AAqsKu0BIa)J1i?PcpCz%7iRd{bI?*1mD~o^pFZ zxuz`84M>s((z%!1Q@|jLYXs~eOh#J6tQV&F9+mh(WRi=A!qnY7VHDpceeO*_`E7;1 z;wmu~)Dj*fnCj>z?m#5gsQ8cy3Xdu#%KCkQ&PYxXm-oXbJ~Bhx@_4VE$rA@H_Ct%O z<`HIo`_#$}W?mDVlmt{z*7*~DCK6UfHfxW z^9GuO?H`k1hYCSOO2rBa#6=1}g!t)W6jc~pjm*-lfpU#HTcj`t12unPK4Z_^LRFb1 z)7djy2M4vfNpj~GklG}zF*cXjmW{p?{PMt<2g*fr5mzczW$FlvMMpnbsD5?%X#I5x zo2D*iG%;KY(9}F{>%vFFn65BZRA^Mhx=4oI^_BHRB%29uiXk$JZkQwhhC#1k6nA_MWk0GxfhcOUO4kjYVKx-FZV+^F0^E3rbI1ULeONWw z?4`(mlt0dWaQ@x0vbO-LT@D9tyv$imMk%}s0n&hlh#!)+%E76Tl1h_o?9eUJK~mzplix+Xi~K=L33|#JC2{%R sK{5xcW@V&!d17}Uqx#~Cv#9-{^MRuy4MYk9Ur~SR<^FfsG5`I40Ar=-aR2}S diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store index 91ecb82bbae644c2e3ee893ba7ffa04285e9d005..d7016090c772ce146d2f15a2161623ee629dab4f 100644 GIT binary patch delta 90 zcmZoMXfc@JFDT5wz`)4BAi$7PoSc)CpP#c?kYhEo7)Y3fA%!86p_rirCdIIshb5Ph U6TwYnU5mt>#J+1YJI7ys00FTXO8@`> delta 53 zcmZoMXfc@JFDSskz`)4BAi$85ZWx@LpIfk5kYhD7qrhfGmR!co>shBVPL5_@GqJ#T JGdss$egI_>4+{VQ diff --git a/grace_dl/__pycache__/__init__.cpython-310.pyc b/grace_dl/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 32999e2685154b35045ca1d132f2d922c9b72962..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 185 zcmd1j<>g`kf}m%OX`(>-F^Gc<7=auIATH(r5-AK(3@MDk44O<;%!YafewvK8xZ~r? 
zQj3Z+^Yh~4S27ea0abyCU!MA*#Xt%D^u(gF%#x(Uy!1qUm;Ca)oczR;VtvP=%#xx+ zm(r4wM1A)lM`u_4^rFP%R0tWLlA|9VpP83g5+AQuP-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/.DS_Store b/grace_dl/tensorflow/.DS_Store index 058d54775752fa9df575f3c4d9a1548570ada4ce..354ac1b19a8cd736f5444b51fb428fc7b68e8ed2 100644 GIT binary patch delta 80 zcmZp1XmOa}&nUVvU^hRb=;XHoGLtm~9&)A>C+8&P=jSkNek*X5kpsq^94IWg`H;{| WMkEE^BC*Vy*(JWQY@R2=&I|yMeH<$Q delta 49 zcmV-10M7q}K!iY$PXQ9KP`eKS5|cg=9Fr&z;j=CgxdF2X6PW?Cx)eZPewW_;krl&o2b~l@dRT6EJ1j2wBX9GmStQId`60NitiFJY)CF*T=)r{Ne z>7K3Xu{W(TB6!6Oap1t!KKd8%S5R|ALgLIJ5(nVDs{Wd>y;0h#=lXp0>V3aSR#zPY z->?4p#pt&$5%LclG(QUlThQbq0E{pikqI5ri7_-Nj*ZA1S|1W-GV3{E7N;lXum$55 zYs0v$#%&mPm%ZAcOAH85j^&4alr5^;zij|1DBKDCJ?$IW-<5WPM!Va|1Rf#%l)LFZG$jG| zC`|X&xFUDpR7W`(Zn>$|f!d^+Q9$dpl}`=KnbEFaLgHx=GsCIncHD^t8V z88t4ij|ra6L<|QsQ9(s2(*#N|GtOMaURfQ>Zr2;#BVW34lGe7$+2EY!YdcB$ury!J z&duDCJR0R~eN}GZT>4xb=&#p0`0qb7V0aO%jUS)3O{YWl`l=6Boy2ib*GO2K7ZadmR ziQAVkzdZLNSkw;LUSNH>{*|o%Yeme1O4Gg;m$gC`kG+bg@=C8*U0?W9<@HL|74flr zOw;RSw;N2o(r=w>EzUcIaKt;8Fw?0q{S=pf03ds1yRy?ky&j<7DnIO-(6*te93>?O z^wgL`o(iT;vp-aCyFVTaKE`ObtiJ_aFW}pvz8Cu|%(XU_SCocOZ2?QZ4ix6r#83D0 zmd}_M_YG~`s6NYv-StaU5Ak96%3deC+2BXDVW)5G?)JN)k7&M&;Cl$x5ujVp-uON2 zy^r9>2x>o~ddejaU;$56F4?gR3mkISp}nra)d1SUhn(MnCehEAE}5HHTc}FL(uLNr zaLYre5wJ4Id+?A6Sc)tV?TRBhUE>V#VNi)HH1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0-ksgX(gcgKi(uz`9a!5JZ z4X8um6#E*^aZf$6%iNh4d6axlge$x|B0R~KEb;~Yk&OHYSV_)C)P^U; zY_`bLNn92R=IuDklDNE(YJw)agRLj}QxIgmW#oJyQ!1!%?t+L3Cfqv`xgUs2A`iCl 
zZ2NfY4x`=-l#$S-eh#c8OR}Pd8t2}j#4tm+BM7*lXCr1ky_id71J2VtEjhQHn?hxR z>-1KRTJjV%;rjJT`23%2VUP>$E(N z^CTXh7B_iT#6pjcRa&a}bWxV^_&X_1iY)%(?GGo%Z^wy>C$iyOWaCoiu+uavZpN_j zwuIwt{f_4El@I&3F-J_cM?3U(ur0{ZCY#BouwX+DfpEnMkUpn`B&1?ME|6#FDf#8I z$~SC9O1g4@nDOu701|rd+*7EWTO%%NagJ67&K&WoWjsJV(^}TpRRAdX5>WFBDt_6z z4AdLL5(;vANU8=8YP_Ahu#zol)fUdyqEmJZ1>GJ*7nXatZiz9zsk)|Z_Dpw3kPG&V z49V}#izb&|8J1n+8F%T_)~|kcs(m28>YI+)2de{f06hTGGg1vqugTWOF^>I$P@^-K@@r+Q+-6){gR|Es7*_Dy(Ct0Bslef5Vysr?D`ml;Bjxn~dm!cb6c z@_=k~VEuSLm$|S`Ikhe5s*89oq0yk>pc`?b@YJk7DRQ_v3XjYlHa(&1zT~lzT*ot6 zH=$*j%&n)O+oWyFd;$8nl$PD5bK4Pe)4Op0l0-?kfzr0(YcNXW=uiddzO{C5*`$0I zg}a+Y-R%rxi#nTWoLIk+A9Y`f=wO3mLOe@#8SVd11?#*t7xJBD5NkAij8F=`wu~^H zU(={t|2A4nQ-w`zX>93vG-$xx_0eFn)izh5y0D~SJzN}WC_YzmBGYS$nEo1=OYsZn zOWLPBppXVm2oi870HjBs1Ahq?A@C0E0UA0_>Fv|UTes_YI1T>#szcDS>9UA($(!Rsc6Z?KIuwhe|5(jof~F#@)* diff --git a/grace_dl/tensorflow/compressor/.DS_Store b/grace_dl/tensorflow/compressor/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0AGn<4V8(0O<(gy?@U^5bX;eZgUL<#MK2UR~`H7{C4b^XT`8kY7=` zJpxd^g2#LgLJ~f8xb7q-jmN%@S8|0jxvpKiH zL3j|Myo?)iMH`N>{K8+bc`y%caG5lu39r~ZYN93_F^D7F)f9YRqj(C@L=c8mvfpu? 
zv9g)j7gOb8VY0F?*71oralELkV|L_%iLm=FwDmMQbX@Ei7mZ;>>3-}Yt158i*7D`; zGA(RTyI7TKQq3lg9y*%2xTs{2D>!*1%9%2+h<*zq`6D0vV3an4vCv0_-4oSV40h_{ zs;q@HgD1MMTI|fM6@#ZrZr7!F^z6IsC(j0BEpoMxnJfp3AVy_s zE!)-TpwD#&CU;MrOhqL#Sxk)Unp{|`bXLiJpzp)J-tb0?kTX-9DZPfvn2&{c8x6Wr zwPEA7!Pr-5j|Nn~3$HmmcSd^!^C>JVU#dg)l8mUNl3gT2Um~~5^(6_&FoN)7xAwfA ze)~V=iKU7U`ZpN5CFj04DW-vBOcL-xcl+FhW;RvYC0SNfh0QYk2AKDm3ydm9UU@}b zC(H6u7z5k>#&}u%5Cm!lli#IXdiLOMNL#JXJ5V8%cw!rm9)a*s(JKlRL7?F7027d+ z2`)KMDga6`pj2qXCTPe(bk2Y>(L4t0xB+>PfHi8JCR%DRUv_~;@fc_byGg^5Qu9ue zjA_&H)%^cLAaz8Fdn(3v7|JD=qt`=!v(njvW392$Dg@)udrT^^16be z>38+}XpZQ+pb``1R@$OLkHfX4HY%~gD&0eCM97z!dvm$n5<*GGb9?f7oh9x5CY&_u zAi8u7>a54s7|zLmq0yeXg9aA=4I1Bo($E(RB(OmN4$Jn0$OMpG?vZH`2pYsBKIiiU zG9ETb6Zvc>NU*aXcU1e$bni4B6=hje<5W$i_B1tRZB2Sy*!>j9RGSpomoD?3?uq;` zof))-mX+4J7#Ec&Q;*6uR(3yX5u29KZ$5%07eg-*fQsju{s65y9uBv@NUO=(c$Cyg zIsFNWoAm3CQGEx5yT1fN-dq8W{G08qzYpPzLC_=C10)i1Ri~Mc+ro81PH07;Z`>He*;k<>puVh diff --git a/grace_dl/tensorflow/memory/.DS_Store b/grace_dl/tensorflow/memory/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0r z03?w#Ax#SZUgGswlTq@5NGAPLB7Ma^v8FHS8&WU*1Z%Qgr#5({dZ|8Xp+317D66o| zLjdJ`Ha%I9o@A%A2_9wXlz_KDmf^o7L-1O)Gds$}av+kz>koJf>(}!Zx*IP{xdeM1 zws{Eo|~$N1XV z@a=_Yt3#zz!;|>EvVLFmRNc3}QAt~ufq)JHEIs;4^c+aPThXpj+C&|pkK$}k^gANj zNRRtTDx`_lbPT~Z23aPeO(oaUL_FDgy}q^;b+l-y0=Z10O!ZBw+evyH^<0;tF`34} zi48fA`*FrOW?-fO0*b#`dY}4#hKpBQ(HL7_X$7_gw`;{+5L`(~7uv8yYN&ZRt&v4C zqB+hD9X{ASkRnsd**8aOjmH5jLJaCkvsG9 z@;M!adDtn4Eld!bBx0*lVL&)D8pGK@7!!B;^@^jU7vRkCDF~ols_o5!C?7vn(y~_e z0p`pVjbA#2n{XV^RtB!TJUzHxouIf`glGoGZORLw)d3wY>|FWg8in0<-P2eM2lW7_ z1;>_iTjji$%0Yti6z7NVwH7zJhW57*?jc;Z$4S^Djl+UPa4CeOO2hJ0#hunmwpwt* mfz{XyyW|GC;SdJ8F>&vWz5OqY4Ofa3pHYm30Ye~MV}AjyHAR&G diff --git 
a/grace_dl/torch/__pycache__/__init__.cpython-310.pyc b/grace_dl/torch/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index ac61f606d7c495eb828d3b8b71203a73ad526c96..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2858 zcmai0&2HO95ayB;Mai=3IJVQYNzEomTNSkg^wu^g?8<17!c`7-P(UcKpt&oVfk?W$ zlpPcDrS>EA0i2`J*XUF5+EdTD_0*XqDN_DH7TCcqXMbkC`DT{U;$l;S=Kb~O@b9{& z{ezQgGhwm?UG9Kj8Y5#ZAOjNU1D$9;X-sFvg~kj{&W(WqbCcCzUQ=@u<`%2NysqXo zX76e3#=mfq*49&F`-fh-Z;9v$rg0!fZ*DIFm0f7gAt}*Xy;^L zGJ}~H+Mvd2aJH3NzwjUurylMr8jf}7J?QUj5Ha|1PW~WmJ$d%(h`BL$;zQ^>l#y^w z{P-0*X}FHh3t-8aqPzjZa8wgJJCrvj$9_Y+rkG-b|!J`c7Noo7meM` z-Iu-X-R=mW;+dpu+>ImQ9d;@8LqDdpb99QFE$ut6EES6B*M}Z($1l}dkFYFR2 zU~4*Ql9s8ydhlVc;b~jY<+mUT)-f_LSWEE@c*lWp%u*b*U|eUm8rNBaHGz*dTL1(X z(`GLRAQH)UFMumz2oZFKz_S7xKO8x3$Q;J!zBnWQn1?wjJNtJtdKqk-lBq_usSXk{ zWKX*yeaNu7d(4G9;^|rz4N0Y+aOw(9B>>tsvIu?z2Z)bRl(4Dj0Pq>JRzUdr&w{Pc>-w&RN5^zap|~4s-fG0#}g4s$B)a*^CCd%P3(l6GtQhis(=FvJIUh$ zs#~*BAfE+>`G!ZuzI9Z70AkLYLi=8CHg{&hy0vs6PO0}Wp^AoM>^#FEImb%_L1-H=6`IBsOY2=yMH^-hwWlgD4Wa`kcU6Jt*+B zzm(QsUVo%6q+Y`F3?^r+;(`P-flMcN`I70g&v zD~_v_Y?h~dhU%vvk_S~wI^~M@8};dl3onP@9*1Icw^>(NrAX_Ckg;PfZ>46dde)aS z;~vf#S6y?rc^~5LhaiA<7~Iqf)U#f|gWrU@qU4351oEeY%EurQ%!^rsZZB0>Dd4{+ z%Zs03Fx6gbmjvpm{#HMOs`^^LG_u)m`k8TND!qwcP&YNE=Gz*9U6(Z&ser!#~53s`=g4h^Iel`%j18nhDM(Iv7%)=7)l zBw3&Hf41}n?pWD-xF693SMWIjR|*%1Xv25$If{9_(^gI$Wx_Xj1oFN`lKH4tSM_hC zf0poy&gvpn1ir%CHs`NYm#a8G0|oXx_3{2Nkra#>q8mKRpn{|9yqfN1~# diff --git a/grace_dl/torch/__pycache__/helper.cpython-310.pyc b/grace_dl/torch/__pycache__/helper.cpython-310.pyc deleted file mode 100644 index 8541fdca3fd37d5a718b4d7e330c741dceab7ce3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3310 zcmbuBOKcle6ox&vPj+oA4PAFzU429IlGbk^Ga%mvndM25@=_t$&N@K0 zxhc0zkw1d$1j#^l@ivk6(je0t5qS9)S~oNat%qks*2_T4DD5${7HIE5>*ejD!^=X; zD(!P{nh6LEF0;y|2j()Epld6W&1R zXD{0aKZEXoJE;5&f!(jzX|MXAKXWET#Xzy!o3q>9N7hrRc?-DcK 
z@KfugGn!?2%r)W;yM6A+M${X%uD}v8g5?q{5Aoe%k2eO(F|2#knum%=KwX78$M=ey zHy-*u%#OgB{Ca*u2E^;*d8Dn`t-cj&7jN+yr}B>5rkG;B(@zw+s3%(%slb zOc5s)L78xh9}!2r84xo~0&^v3uH5OxO)A*+SMt}ZrmW^mhFmK#}4;WRkTG#n$@Z;{kUaY zrk@a|u;pDp4Mo(fa?x<0B@ND{$!bNj5M!lcPV+MhrQ*4!aklPgSsG5+E*(+VPiG2A z^JcYNvZ^Hpqwez;me5(cxUj`E%4O-32XuXN#*g2u%pC`ry>MmjcoQnLT82}T28YzS z7&MzI@rzh|bEJ?!_eR-)(s9``&z7AnnqIN*;1Jb{x~@?Bi3Jtnw*xKv))V7?+wGcR zIW@6t)x_qz2uGn+8Wyv~Dy{0OYQC{$p4R5ZJD=Tlg4CLz6_QvJO-o-Fwm_6>qV&V} zbp3v&pgX3t5Jo!2u4`ytM0<-#(pfrAJmS^zy6MMCrc)TjPK)L;;{h>S--In^k?OLQ0zJJw z)5vd`Bywzfl2TjJ&v9XLdnn9vnR$UTDpr{#XkvS3K@rGy!lt&z>h&P;A+h5Qs4!$F zK|AaqtO?ok0xql$*-P9Gn+WPe_7k?lChGMe2MOFEV2whN!$j^7Fsc(d3drwXw4K*Q zg_~5CCOa>sEenHBov2WiqmpKfJP$ozSCN}|-qfLkMmQ!3(r+&ZpzFt0)|j8V zV@uAKRdL3t5d9c-=~GjvN|ZWSNgubW+$4PjtC~^;qY6T$yGmu1sED|dZ*7vl|eQ`%xVI5K**R=>hc*V5|m$!AIn`oU9Iz3uL4?Rj*4REJ74 zq#92>nu-Q<(r6mK10wrG4?Yza+C>9KkIGY6^kPm^PjzTpZYQDn`e|Xa8aH26d!$IG GBmV+olG%*_ diff --git a/grace_dl/torch/communicator/.DS_Store b/grace_dl/torch/communicator/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0>*%k5kU#Nf zdpPj(1g3fnf)h@2lHk|MX<;Q6McvNr!bzOMP29puJW9SL+~Mvu;jXZ*?8N8v73q5) zzyo>NrzSe7ieg%hhpet--!>hV=NYR{g*3ZsHD&6e9GWi5Q(2YcOEE+T+rwV|sE42k zl2A?(>m3MH{P{)a`)gg$QT#-4| zR9g$Ut+q8uElysM8~PGbchz(PX$NUKF2{A6n((}mIZqX&+joqkL_RX%uqq}}C{;=0 z6{4u*g~Y1gP}51%Va}9F(;M=)7<{jURD+DkvvK`1D>F8DR-Ko5#kd+gmE&5nXVbc7 zgXe;uR5^R};=7ZlF9w-pL($5V=L0C}@N@t%*EJrjt9mrKkR1pOP!oms+@T@8+*{Y; zXmyd{sC+MrttGIO;( zx3#;Vl)QB`dF{M*ZvS` z<5O1hTqyLsMeT>nh%pjJ#c({9tbDURE-6&~!#^#VZfFgw6;?tb-u*~+adU6waZ@kj z;N_EOwS%$br(m3Pny`Fae4XrW@1`s>OY`P=C=;~R65RO9s+imDi7jA^*Y3$%;Jg*L zF{iNU52=D{2ftKQ77srDL*x^oL?YHNQ%8}fCLoM7+1Ct$rZ3z775SBKa| z8k$;5n%-}AJlz4xN2tH{ zn`W65P+DPpb_P7mTI~D17EBL1(KN4(J*AwTlU?dfJd8d<+Xz*NHB>! 
zz{)G^l>q@(Ik)aLtNlRUg*CI^G~S~|LgPJZo|Uw4t9M$->a60+HGE>ti)!&h*IO|g VW;dh`9=GX#gU@{x*xIN2_P=v763+kt diff --git a/grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc b/grace_dl/torch/communicator/__pycache__/allreduce.cpython-310.pyc deleted file mode 100644 index 4c2ae81f54efcf50c6db69906863060720c12274..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1111 zcmZuvy>HV%6u%G0P1B_6M+6dLU^NnPHyEmlDithM87dZw$~t`4#Fgz!@6IWWoGJYW zkdRp0k$>q{CUzz!-r1xLh?Cy)^WFRW{hsXgItb3suV2nz5`@0HpVi~R&ub992Z|$( zGnC^9V`r0$-k5t>tjD!jl)=xp}`nS1kAMQ^boRjWc5^unu%X;t0=KSm3B2P0ql6@{Vj zN#NUcQIxq*%n0kV$wU-fxoWnP$b6!N)>7FuHU*E7uv8lbF`gFLx!p0MfL9$aJq5R3 z!E#~O#;o9(&=#k5yYX6a`i9gtRQM&r_gW|&j+mOJ<|8XcEIg7|MJ5^7;h{>6Vn?Mh zEPNyQP-g7K@w?&SaX3;e5si*{76K2+ID{VVX$kK+8BDHir{P7zWNQwxHYg8o;b+*x zv;IBCg9o?2;C6fCq=wtE*05Cp*b6+T05H166|Kk}0Z8ddg288b9NjMX6h!wyjZlTo z{5fv;`b;h%s752e$~nSDk-?tD=}?A9TUZ6}U%$;-^y-8jB1S=PL$U045!Q*I|#l$%<$KsnKF>D#DHx_*7< z7PnvT!JyW#2ZMSKl*7gL#!|_t+FwAuIyN# z4uuTl+&6HJIrepUEl{AZ&{JntT3c;OfU~pwnHhfHM^-yKeFE)=KYu;@oe}Z}{%kiG zBi}$b&%j6`X-*27P>Q~ndxf9)1xpwu-x29cc0(jnbmb?33{FWD{sp^8MBU)1DvEiT zrJ}BM{aTb05g%7qWe$*-__@w%Esp1PE#hyLJgRc>^~>*%p1+JITBK@i zxy)lgGM&bd^A6bf4%PAOT6Z8dc7x{}(mq{0yu)JL9wcqCg*60=ML8TnH@|?HkfmP} zOU~(v9+Ov}FByykwP)A=`#*cjaM`iJtzR?SvCMjPVA+c9gHZMf_#yf$|2+uiV%Pe& zz6~0`V|{?uJ8_gpp$jx%pc<@PPnGjRX;CW|irGw+Qe%nEH?@wsx`+P5T9p8@;Vs}w zI#!B8xz1FSGFQg+bXCuMs1##LApy;2Z6r#7y0|yyDg>FV*CP3h;a7lqkpzSyB!aH4` zHKH09!gch<&m5h8gu+3dnHn)Z+{a)Tc|;ap-kdEz`YN^tXaG)n96Rjr5az|VO-5dHh3~S8i{Iq_Xj9RYmLnxV`V%}u>F)n+wq4Qr0 zT8{7X$p4;huw6Xjv|+gmJP(260ewOjPyYX`@UFm9fQrMXfb*h~^BnyF=NEI4uVcFDl&vb|n2dJ=`nuz3OC?vIH1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0zL#<^=pnGH?_Xzk1fe?{?3xe3TUhEPIF2~Z z(bVoF$4fFL7`;Q>buaH)!f}p9io9yaSkBvv>SqG~VK^U$y6m<#{yAOQy489#v&l=x8n(&qSeB zq|0bDJ~+?xFtrt1^Wb3D1eS`x)p&t6q$Ue2`!&8L+FSVF@CMhoCNlyI@cPA3`MF4$ 
zvZxm#(xQOAb_&FS3AnzlgxSv&Z5dI=$oMW1Xi(z9bX{^Zjy>ZmknNC)>xh}Td5)_T~!Os!Jd` z!JTf6>)S7S+vGmPu16nw>|r0Tpa0Y2_U&ohF(IV|TM9}|kJ4qySGhg+DZN~={7w SeBJPQgDt2<4pE;x@qPpGH1|(-C#u?ZHc=RzhXlE#>MMT z1dCfR>=$4ZQM4i>oLNO{He!@0uA--8#P1RntK=zBiDoB!6e;>M>8CH@D{|1MzPI1j zGo!6-O~G(;^CU3Cy<{JT{T@t5PRSWr;Y?v>jw$MR$2z)4PUs0+aTguMuqH=|V$Yc( z9sh%UKvq#lx(MdDi@JEi;C6oSlER5_d!qDlS?GT1(@7KZ+14k=vYKoAf|$$j>R&nh z#cE@RQ)!M%H!oE(0q)qY#a&+q-XfBW8WYGk3;Hdoa! zL^a&r-I^iB*Vc~)vxU#>Od4BG78r}Y04Aduy-Kfx&8WEw8wHFv9==%xLNystMMePe zh(99}s+i(WGaxTgF^~#`subqr8Hva!1xPc0<$k#ThjTuc%})z=*fydn$3~h`Tk);5 zT8XWT7%v1MI%!Q^HdC<)x-1aJMtoKd^x&qj8eoLCQUrKKI}z?s3t70iti+lFVauf+ zyrCb|5)Mqu-T>psNl*?blXfHr-GJqaK=ACq1U*DPwsQ!`XL(*WrOR`_JFVKWta9u0 z%pj0`<|C`CNm#s~ekae1N?M!efn9S2e9XIG5M+z^_h>?Uba~?~d11Vwu*w}>1ldKI(nL& zkqKWVT|A{-63QT2J_IF-4N2?pUgPTi!Cm26u~it7HPwwq76KhrX&th$kvaN~wE*!p zwl1NR)WAn`H~9+6EYnTVB8vty?eBPQ=e3W=3#YAl54Ow>7{6P~21w1V)+&fJfV?iK z2r2HT=6&?uMenGQwf6MTT*9q)aEll2$(jbNw7`YWwh7>up%(V|B_QN7y~_TdmhA^` z(2~K|$mTwtatlny9`F?pAkznQl{j)lp~ECc3D1W6Uf7SiUxZcv<)adr%mpo<5EBFX8~=W zu8Yi?kI*xq9eP|GZd)UNZS9k*R~Z3?frN7=dgm|kB#VDAJ(fUMLOzxszFi>aA3%eF zqD01_D0_Kcw`yMD?+bbUJ7{L}Ce3rz7Ep^=f#zd0H^KOen<_1IRc$LZ1a=)=Ld{-- zUnrswRz@Ovfo1XKU4#4mIKVdA3q5!cEcD<(=)nft%v!GK`&!oLsb^oW1O5j4j72SF K1kUGsJO2UMM(H5{ diff --git a/grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc b/grace_dl/torch/compressor/__pycache__/topk.cpython-310.pyc deleted file mode 100644 index 0c51e8cc84c04a7651995e674f5d74b0adaecdfb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1654 zcmZ`(&2Qv16!%AxNk?t9uw{1XQph6i1c3ko}caae*QhzTU&hs-{Zf2AN>&!@+TUv zf8ZnEf|##BNg`=UhWKYCovw2MHv78O#CGMJQPdo+YQG4FFeum~t0NJ8P7SN-?Xa z)NLk=}rvU3j5%ZS(^e>xZZwp*J=;Vz09_biEEyqhp+Y6V)3SbHDRvQmqSy zyF)ZPwSzHzsn(TmgKcn&*w-CU{T>)O$gPjBEb}=G6d6)UhU^6y0oNqEih<1?tAiD* z!xtnV!w4$wx$Ph8=KKF)F)|tj_cxfEC6{DDmjv*lEys0y29 z8dFc13ydm9-ngc2Bg^tq7z5M3X53PL2ns4IT@VV8{<+%@kyM}Gi;Y2ex8lk)ciNfs9>4fRJbtG7V8pb`_M-@zfz 
z?en`=6t8a&BMj1RlimYb{wMWy+^y6x{wDQz!D!j4-LV7?1wgAWC^S7jHuU1>{TsF< zN3 lx0zk6E$h$n?_PQHYtpz_;&Q;J=YTIG_lEn`N$_ zWs_P?OB`=z*>6)(b~Bz6`cpjgGf-}8#hYA}<(fO5Kp$g>XVLpG!Z)Go7P86W;9jWF y3zhk%-SO|vuGjGH?D}^{BfUF1U}yOSuN8X3QWt;SrTPs%wB2KbhwwPw>HP!fCy0pv diff --git a/grace_dl/torch/memory/.DS_Store b/grace_dl/torch/memory/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0{VkloK>ZkU-aLW0@Ua5@~lGY=`3_8pQn> z{*qfHBx*V;X1p8;5+lv5$MgA~*B%{>8OodQpRb;B#=enrC^4(=2tB*y>VVHwfnFR?{vS^^4wnct<~V=nOg^^&$l6H`NF{5 zw))A%>-pJ*TsqyD!76BFu&!CjE)gMn#4_95#BpQ04K9a^j&R^lA|%gt5BFGS`;4Z* z=$+8{AJ{;!vGf4;9_AuJsR~C^9M!seXomTihU~iDXm!L}3jVKCET0VvJSW1^KQdtt#BWE?vM2xm diff --git a/grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc b/grace_dl/torch/memory/__pycache__/residual.cpython-310.pyc deleted file mode 100644 index 676a08aa863aa5f09d4c9691cbc0047043330a9d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1093 zcmZuw%T5$Q6us52Y1#o_gNQ3)qHz{Z{{T@4L^p^DhQ!RmPSI5kO}eMCst07!tmGHm z>ym%*3u^1axHEC*%6qHnFrbyxsn@Mj_nf|6&d*1H?d|W+@4qIZ=mo0#3<-96TFf|sZLT_&NdepYuNQ;1PMbX0I$wj zDmHY5C;T%QdLj@V9DNZYX=E05q|U^I=MC!*yo>GE^OWe?MsrjE8`$+dgbZp(pk|1y zb`Tu_NXNF%T{EXH7`5^CWW|j?;zhyxuJN?Y->VMZS0TbVC^G2JyLeYirQ;D-M_IMU z$0HtZmdE3~y7oTc%*nL8%#tVSEWk#bDR)r{PjKH5DLypk)`K6W|QrCK@8s(rg|Gt>h`dG%d2hl2^4IOC0ZA+0|R+@QlK zEsI0sK~8ecYKfc&%wyB#Yg#TNQ&QYE@}26=s(FZ$+Wi8xSNmU`(*VC`a34-Pb;l}m zRO SMiZ_!**s%JOWa-Pxqkut8U-8x diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store index 695a77812ec27a7da4b88b14093c1910fbf7b2d9..7c8bd46e92c329f2e7b3f2d9be9f7a15c27f34c2 100644 GIT binary patch delta 76 zcmZoMXfc=|#>B)qu~2NHo+2aj#DLw41(+BanJ4owzT`|PPR>cn&(C4l{FsrGbu&8$ eKL=3lW=5v(%#-;=EIAk%7=Va@VRL}U7G?kkyb@FZ delta 68 
zcmZoMXfc=|#>B`mu~2NHo+2aD#DLwC4MbQb^D{l$EWrGoWn+Uh<7Rdaeh#3T&5F$5 WnJ4p$SaL7`0V4wg)8+t?EzAIYqY$A0 diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store index c3bc681206f083871f7dd589fe1077b0a8f8ec98..5f8c5e7ffad68965e260075bb66a427e32822fe8 100644 GIT binary patch delta 94 zcmZoMXfc=|#>B)qu~2NHo}w^20|Nsi1A_nqLq<}0aY0f}e$vLmm5hw+lTBD3a;6j~ v=OpFl=P)d0XJgvfAjrI#or9kPsC%;@$9Lw*{34DVK>Z*CST;w9tYHQKS1%S* delta 380 zcmZoMXfc=|#>B!ku~2NHo}w@x0|Nsi1A_nqLvc>JVQ_MOZo%e>%qyAeK@zMC*$k-+ zMGT1uNuVeY$a?|_B-z}27nh`*{3M_#M;c4oOFqeCj_4{<2&gE?fLIUI2L$yBAd-ur z1n7V~hGK?%pfl1Ka)5L>%$*wFu{d)oMe0fK@T7OD`}? p=3#ocnT7ci<7Rdaehy&R0z>dS^JIPzM-E`nGJ#yVIYML&GXRHKREYop diff --git a/patch_files/horovod/_keras/__init__.py b/patch_files/horovod/_keras/__init__.py deleted file mode 100644 index a24222e..0000000 --- a/patch_files/horovod/_keras/__init__.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2017 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from distutils.version import LooseVersion - -import horovod.tensorflow as hvd -import tensorflow as tf -from horovod.tensorflow.gradient_aggregation import LocalGradientAggregationHelper -from horovod.tensorflow.gradient_aggregation_eager import LocalGradientAggregationHelperEager -from horovod.tensorflow.mpi_ops import rank - - -_PRE_TF_2_4_0 = LooseVersion(tf.__version__) < LooseVersion('2.4.0') - - -def create_distributed_optimizer(keras, optimizer, name, device_dense, device_sparse, - grace, compression, sparse_as_dense, gradient_predivide_factor, - op, backward_passes_per_step=1, - average_aggregated_gradients=False, - num_groups=0): - class _DistributedOptimizer(keras.optimizers.Optimizer): - _HAS_AGGREGATE_GRAD = True - - def __init__(self, **kwargs): - self._name = name or "Distributed%s" % self.__class__.__base__.__name__ - self._aggregated_gradients = False - - self._allreduce_grads = hvd._make_allreduce_grads_fn( - self._name, - device_dense, - device_sparse, - grace, - compression, - sparse_as_dense, - op, - gradient_predivide_factor, - num_groups) - - self._agg_helper = None - if backward_passes_per_step > 1: - if hvd._executing_eagerly(): - self._agg_helper = LocalGradientAggregationHelperEager( - backward_passes_per_step=backward_passes_per_step, - allreduce_func=self._allreduce_grads, - sparse_as_dense=sparse_as_dense, - average_aggregated_gradients=average_aggregated_gradients, - ) - else: - self._agg_helper = LocalGradientAggregationHelper( - backward_passes_per_step=backward_passes_per_step, - allreduce_func=self._allreduce_grads, - sparse_as_dense=sparse_as_dense, - average_aggregated_gradients=average_aggregated_gradients, - rank=rank(), - optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS, - ) - - super(self.__class__, self).__init__(**kwargs) - - def get_gradients(self, loss, params): - """ - Compute gradients of all trainable variables. 
- - See Optimizer.get_gradients() for more info. - - In DistributedOptimizer, get_gradients() is overriden to also - allreduce the gradients before returning them. - """ - grads = super(self.__class__, self).get_gradients(loss, params) - gradients = zip(grads, params) - return self._allreduce(gradients) - - def _aggregate_gradients(self, grads_and_vars): - grads, vars = list(zip(*grads_and_vars)) - aggregated_grads = self._allreduce(grads) - if _PRE_TF_2_4_0: - # Prior to TF 2.4.0, this function was expected to return only a list of - # grads, not a list of (grad, var) tuples. - return aggregated_grads - return list(zip(aggregated_grads, vars)) - - def _allreduce(self, grads): - self._aggregated_gradients = True - - if self._agg_helper: - return self._agg_helper.compute_gradients(tuple(grads)) - else: - return self._allreduce_grads(grads) - - def apply_gradients(self, *args, **kwargs): - if self._agg_helper: - if isinstance(args[0], zip): - # If grad_and_vars are passed in as a zip object - # convert to a list. This is necessary for TF2.4+ - # b/c args[0] is used in both conditional branches - # inside _agg_helper.apply_gradients(). - args = list(args) - args[0] = list(args[0]) - args = tuple(args) - - results = self._agg_helper.apply_gradients( - lambda: super(self.__class__, self).apply_gradients(*args, **kwargs), - self, - *args, - **kwargs, - ) - else: - results = super(self.__class__, self).apply_gradients(*args, **kwargs) - - if not self._aggregated_gradients: - raise Exception('`apply_gradients()` was called without a call to ' - '`get_gradients()` or `_aggregate_gradients`. If you\'re ' - 'using TensorFlow 2.0, please specify ' - '`experimental_run_tf_function=False` in `compile()`.') - - return results - - # We dynamically create a new class that inherits from the optimizer that was passed in. - # The goal is to override get_gradients() method with an allreduce implementation. 
- # This class will have the same name as the optimizer it's wrapping, so that the saved - # model could be easily restored without Horovod. - cls = type(optimizer.__class__.__name__, (optimizer.__class__,), - dict(_DistributedOptimizer.__dict__)) - - return cls.from_config(optimizer.get_config()) - - -def _eval(backend, op_or_result): - if hvd._executing_eagerly(): - return op_or_result - else: - return backend.get_session().run(op_or_result) - - -if hasattr(hvd, 'broadcast_global_variables'): - def broadcast_global_variables(backend, root_rank): - return _eval(backend, hvd.broadcast_global_variables(root_rank)) - - -def allreduce(backend, value, name, average, prescale_factor, postscale_factor, op, grace, compression): - return _eval(backend, hvd.allreduce(tf.constant(value, name=name), average=average, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - op=op, grace=grace, compression=compression)) - - -def allgather(backend, value, name): - return _eval(backend, hvd.allgather(tf.constant(value, name=name))) - - -def broadcast(backend, value, root_rank, name): - return _eval(backend, hvd.broadcast(tf.constant(value, name=name), root_rank)) - - -def load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects): - horovod_objects = { - subclass.__name__.lower(): wrap_optimizer(subclass) - for subclass in keras.optimizers.Optimizer.__subclasses__() - if subclass.__module__ in optimizer_modules - } - - if custom_optimizers is not None: - horovod_objects.update({ - cls.__name__: wrap_optimizer(cls) - for cls in custom_optimizers - }) - - if custom_objects is not None: - horovod_objects.update(custom_objects) - - return keras.models.load_model(filepath, custom_objects=horovod_objects) diff --git a/patch_files/horovod/keras/__init__.py b/patch_files/horovod/keras/__init__.py deleted file mode 100644 index 9cbcafe..0000000 --- a/patch_files/horovod/keras/__init__.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 
2017 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import keras -import keras.backend as K - -from horovod.tensorflow import init -from horovod.tensorflow import shutdown -from horovod.tensorflow import size -from horovod.tensorflow import is_initialized, start_timeline, stop_timeline -from horovod.tensorflow import local_size -from horovod.tensorflow import rank -from horovod.tensorflow import local_rank -from horovod.tensorflow import mpi_threads_supported, mpi_enabled, mpi_built -from horovod.tensorflow import gloo_enabled, gloo_built -from horovod.tensorflow import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built -from horovod.tensorflow import Average, Compression, Sum - - -from horovod.keras import callbacks, elastic -import horovod._keras as _impl - - -def DistributedOptimizer(optimizer, name=None, - device_dense='', device_sparse='', - grace=None, - compression=Compression.none, - sparse_as_dense=False, - gradient_predivide_factor=1.0, - op=Average, - num_groups=0): - """ - An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights. - - Args: - optimizer: Optimizer to use for computing gradients and applying updates. - name: Optional name prefix for the operations created when applying - gradients. 
Defaults to "Distributed" followed by the provided - optimizer type. - device_dense: Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_OPERATIONS. - device_sparse: Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_OPERATIONS. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. - sparse_as_dense: Treat all sparse gradients as dense tensors. This can - help improve performance and memory utilization if - the original sparse gradient has high density. - Defaults to false. - gradient_predivide_factor: gradient_predivide_factor splits the averaging - before and after the sum. Gradients are scaled by - 1.0 / gradient_predivide_factor before the sum and - gradient_predivide_factor / size after the sum. - op: The reduction operation to use when combining gradients across - different ranks. Defaults to Average. - num_groups: Number of groups to assign gradient allreduce ops to for explicit - grouping. Defaults to no explicit groups. - """ - if gradient_predivide_factor != 1.0 and rocm_built(): - raise ValueError('gradient_predivide_factor not supported yet with ROCm') - - if op != Average and op != Sum: - raise ValueError('op currently only supports Average and Sum') - - return _impl.create_distributed_optimizer( - keras=keras, - optimizer=optimizer, - name=name, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - sparse_as_dense=sparse_as_dense, - gradient_predivide_factor=gradient_predivide_factor, - op=op, - num_groups=num_groups, - ) - - -def broadcast_global_variables(root_rank): - """Broadcasts all global variables from root rank to all other processes. - - Arguments: - root_rank: Rank of the process from which global variables will be broadcasted - to all other processes. 
- """ - return _impl.broadcast_global_variables(K, root_rank) - - -def allreduce(value, name=None, average=True, prescale_factor=1.0, postscale_factor=1.0): - """ - Perform an allreduce on a tensor-compatible value. - - Arguments: - value: A tensor-compatible value to reduce. - The shape of the input must be identical across all ranks. - name: Optional name for the constants created by this operation. - average: If True, computes the average over all ranks. - Otherwise, computes the sum over all ranks. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - """ - return _impl.allreduce(K, value, name, average, prescale_factor, postscale_factor) - - -def allgather(value, name=None): - """ - Perform an allgather on a tensor-compatible value. - - The concatenation is done on the first dimension, so the input values on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Arguments: - value: A tensor-compatible value to gather. - name: Optional name prefix for the constants created by this operation. - """ - return _impl.allgather(K, value, name) - - -def broadcast(value, root_rank, name=None): - """ - Perform a broadcast on a tensor-compatible value. - - Arguments: - value: A tensor-compatible value to reduce. - The shape of the input must be identical across all ranks. - root_rank: Rank of the process from which global variables will be - broadcasted to all other processes. - name: Optional name for the constants created by this operation. - """ - return _impl.broadcast(K, value, root_rank, name) - - -def load_model(filepath, custom_optimizers=None, custom_objects=None, grace=None, compression=Compression.none): - """ - Loads a saved Keras model with a Horovod DistributedOptimizer. 
- - The DistributedOptimizer will wrap the underlying optimizer used to train - the saved model, so that the optimizer state (params and weights) will - be picked up for retraining. - - By default, all optimizers in the module `keras.optimizers` will be loaded - and wrapped without needing to specify any `custom_optimizers` or - `custom_objects`. - - Arguments: - filepath: One of the following: - - string, path to the saved model, or - - h5py.File object from which to load the model - custom_optimizers: Optional list of Optimizer subclasses to support - during loading. - custom_objects: Optional dictionary mapping names (strings) to custom - classes or functions to be considered during deserialization. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. - - Returns: - A Keras model instance. - - Raises: - ImportError: If h5py is not available. - ValueError: In case of an invalid savefile. - """ - def wrap_optimizer(cls): - return lambda **kwargs: DistributedOptimizer(cls(**kwargs), grace=grace, compression=compression) - optimizer_modules = {keras.optimizers.Optimizer.__module__} - return _impl.load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects) diff --git a/patch_files/horovod/tensorflow/__init__.py b/patch_files/horovod/tensorflow/__init__.py deleted file mode 100644 index b616e28..0000000 --- a/patch_files/horovod/tensorflow/__init__.py +++ /dev/null @@ -1,761 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# Modifications copyright (C) 2019 Uber Technologies, Inc. -# Modifications copyright Microsoft -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=g-short-docstring-punctuation - -import os -import warnings - -from horovod.common.util import check_extension, gpu_available, split_list - -check_extension('horovod.tensorflow', 'HOROVOD_WITH_TENSORFLOW', __file__, 'mpi_lib') - -from horovod.tensorflow import elastic -from horovod.tensorflow.compression import Compression -from horovod.tensorflow.functions import allgather_object, broadcast_object, broadcast_object_fn, broadcast_variables -from horovod.tensorflow.mpi_ops import allgather, broadcast, _allreduce, _grouped_allreduce, alltoall -from horovod.tensorflow.mpi_ops import init, shutdown -from horovod.tensorflow.mpi_ops import is_initialized, start_timeline, stop_timeline -from horovod.tensorflow.mpi_ops import size, local_size, rank, local_rank, is_homogeneous -from horovod.tensorflow.mpi_ops import rank_op, local_rank_op, size_op, local_size_op -from horovod.tensorflow.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built -from horovod.tensorflow.mpi_ops import gloo_enabled, gloo_built -from horovod.tensorflow.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built -from horovod.tensorflow.mpi_ops import Average, Sum, Adasum -from horovod.tensorflow.mpi_ops import handle_average_backwards_compatibility, check_num_rank_power_of_2 -from horovod.tensorflow.util import _executing_eagerly, _make_subgraph, _cache -from horovod.tensorflow.mpi_ops import join -from horovod.tensorflow.sync_batch_norm import SyncBatchNormalization -from 
horovod.tensorflow.gradient_aggregation import LocalGradientAggregationHelper - -import tensorflow as tf - -# @DEKHTIARJonathan: Do not remove, this fixes issues: -# - https://github.com/tensorflow/tensorflow/issues/38516 -# - https://github.com/tensorflow/tensorflow/issues/39894 -if tf.__version__.startswith('2.2.'): - from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check - device_compatibility_check.log_device_compatibility_check = lambda policy_name, skip_local: None - - -def allreduce(tensor, average=None, device_dense='', device_sparse='', - grace=None, - compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0, - name=None): - """Perform an allreduce on a tf.Tensor or tf.IndexedSlices. - - This function performs a bandwidth-optimal ring allreduce on the input - tensor. If the input is an tf.IndexedSlices, the function instead does an - allgather on the values and the indices, effectively doing an allreduce on - the represented tensor. - - Arguments: - tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce. - The shape of the input must be identical across all ranks. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v0.21.0. - - device_dense: Device to be used for dense tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - device_sparse: Device to be used for sparse tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. 
- name: A name of the allreduce operation - - Returns: - A tensor of the same shape and type as `tensor`, summed across all - processes. - """ - op = handle_average_backwards_compatibility(op, average) - - if rank() == 0: - print(tensor) - if isinstance(tensor, tf.IndexedSlices): - # TODO: Need to fix this to actuall call Adasum - if op == Adasum: - raise NotImplementedError('The Adasum reduction does not currently support sparse tensors. As a ' - 'workaround please pass sparse_as_dense=True to DistributedOptimizer') - with tf.device(device_sparse): - # print("==Debug== this model has a sparse gradient") - # For IndexedSlices, do two allgathers instead of an allreduce. - horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), - dtype=tensor.values.dtype) - values = allgather(tensor.values) - indices = allgather(tensor.indices) - - # To make this operation into an average, divide allgathered values by - # the Horovod size. - new_values = (values / horovod_size) if op == Average else values - return tf.IndexedSlices(new_values, indices, - dense_shape=tensor.dense_shape) - else: - average_in_framework = False - if rocm_built(): - # For ROCm, perform averaging at framework level - average_in_framework = op == Average or op == Adasum - op = Sum if op == Average else op - - with tf.device(device_dense): - horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), - dtype=tensor.dtype) - - if grace and op != Adasum: - # print("==Debug== grace is called") - new_tensor = grace.step(tensor, name=name) - else: - name = None - tensor_compressed, ctx = compression.compress(tensor) - summed_tensor_compressed = _allreduce(tensor_compressed, op=op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - name=name) - summed_tensor = compression.decompress(summed_tensor_compressed, ctx) - if op == Adasum: - if 'CPU' not in tensor.device and gpu_available('tensorflow'): - if nccl_built(): - if not 
is_homogeneous: - raise NotImplementedError( - 'Running GPU Adasum on heterogeneous cluster is not supported yet.') - elif not check_num_rank_power_of_2(int(size() / local_size())): - raise NotImplementedError( - 'Running GPU Adasum with non-power of 2 nodes is not supported yet.') - if rocm_built(): - horovod_local_size = tf.cast(local_size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else local_size(), - dtype=tensor.dtype) - new_tensor = summed_tensor / horovod_local_size - else: - new_tensor = summed_tensor - else: - warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors ' - 'are copied to CPU memory instead. To use Adasum for GPU reduction, please ' - 'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.') - new_tensor = summed_tensor - else: - if not check_num_rank_power_of_2(size()): - raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') - new_tensor = summed_tensor - else: - if rocm_built(): - new_tensor = (summed_tensor / horovod_size) if average_in_framework else summed_tensor - else: - new_tensor = summed_tensor - return new_tensor - -def grouped_allreduce(tensors, average=None, device_dense='', device_sparse='', - grace=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0): - if not tensors: - return tensors - - op = handle_average_backwards_compatibility(op, average) - - average_in_framework = False - if rocm_built(): - # For ROCm, perform averaging at framework level - average_in_framework = op == Average or op == Adasum - op = Sum if op == Average else op - - if any(isinstance(t, tf.IndexedSlices) for t in tensors): - # TODO: Need to fix this to actuall call Adasum - if op == Adasum: - raise NotImplementedError('The Adasum reduction does not currently support sparse tensors. 
As a ' - 'workaround please pass sparse_as_dense=True to DistributedOptimizer') - with tf.device(device_sparse): - new_values = [] - for tensor in tensors: - # For IndexedSlices, do two allgathers instead of an allreduce. - horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), - dtype=tensor.values.dtype) - values = allgather(tensor.values) - indices = allgather(tensor.indices) - - # To make this operation into an average, divide allgathered values by - # the Horovod size. - new_values += (values / horovod_size) if op == Average else values - return [tf.IndexedSlices(x, indices, - dense_shape=t.dense_shape) for x,t in zip(new_values, tensors)] - else: - with tf.device(device_dense): - tensors_compressed, ctxs = zip(*[compression.compress(tensor) for tensor in tensors]) - summed_tensors_compressed = _grouped_allreduce(tensors_compressed, op=op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor) - summed_tensors = [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] - if op == Adasum: - tensor = summed_tensors[0] - if 'CPU' not in tensor.device and gpu_available('tensorflow'): - if nccl_built(): - if not is_homogeneous: - raise NotImplementedError( - 'Running GPU Adasum on heterogeneous cluster is not supported yet.') - elif not check_num_rank_power_of_2(int(size() / local_size())): - raise NotImplementedError( - 'Running GPU Adasum with non-power of 2 nodes is not supported yet.') - if rocm_built(): - new_tensors = [] - for tensor in summed_tensors: - horovod_local_size = tf.cast(local_size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else local_size(), - dtype=tensor.dtype) - new_tensors += tensor / horovod_local_size - else: - new_tensors = summed_tensors - else: - warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors ' - 'are copied to CPU memory instead. 
To use Adasum for GPU reduction, please ' - 'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.') - new_tensors = summed_tensors - else: - if not check_num_rank_power_of_2(size()): - raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') - new_tensors = summed_tensors - else: - if rocm_built(): - new_tensors = [] - for tensor in summed_tensors: - horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), - dtype=tensor.dtype) - new_tensors += (tensor / horovod_size) if average_in_framework else tensor - else: - new_tensors = summed_tensors - return new_tensors - -def _allreduce_cond(tensor, *args, **kwargs): - def allreduce_fn(): - return allreduce(tensor, *args, **kwargs) - - def id_fn(): - return tensor - - return tf.cond((size_op() > 1) if int(os.environ.get("HOROVOD_ELASTIC", 0)) else tf.convert_to_tensor(size() > 1), - allreduce_fn, id_fn) - -def _grouped_allreduce_cond(tensors, *args, **kwargs): - def allreduce_fn(): - return grouped_allreduce(tensors, *args, **kwargs) - - def id_fn(): - return tensors - - return tf.cond((size_op() > 1) if int(os.environ.get("HOROVOD_ELASTIC", 0)) else tf.convert_to_tensor(size() > 1), - allreduce_fn, id_fn) - - -try: - _global_variables = tf.compat.v1.global_variables -except AttributeError: - try: - _global_variables = tf.global_variables - except AttributeError: - _global_variables = None - -if _global_variables is not None: - def broadcast_global_variables(root_rank): - """Broadcasts all global variables from root rank to all other processes. - - **NOTE:** deprecated in TensorFlow 2.0. - - Arguments: - root_rank: rank of the process from which global variables will be broadcasted - to all other processes. - """ - if _executing_eagerly(): - raise RuntimeError( - "hvd.broadcast_global_variables() does not support eager execution. " - "Please use `hvd.broadcast_variables()` instead." 
- ) - - return broadcast_variables(_global_variables(), root_rank) - -try: - _get_default_graph = tf.compat.v1.get_default_graph -except AttributeError: - try: - _get_default_graph = tf.get_default_graph - except AttributeError: - _get_default_graph = None - -try: - _SessionRunHook = tf.estimator.SessionRunHook -except AttributeError: - try: - _SessionRunHook = tf.train.SessionRunHook - except AttributeError: - _SessionRunHook = None - -if _SessionRunHook is not None and _get_default_graph is not None: - class BroadcastGlobalVariablesHook(_SessionRunHook): - """ - SessionRunHook that will broadcast all global variables from root rank - to all other processes during initialization. - - This is necessary to ensure consistent initialization of all workers when - training is started with random weights or restored from a checkpoint. - - **NOTE:** deprecated in TensorFlow 2.0. - """ - - def __init__(self, root_rank, device=''): - """Construct a new BroadcastGlobalVariablesHook that will broadcast all - global variables from root rank to all other processes during initialization. - - Args: - root_rank: - Rank that will send data, other ranks will receive data. - device: - Device to be used for broadcasting. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. 
- """ - super(BroadcastGlobalVariablesHook, self).__init__() - self.root_rank = root_rank - self.bcast_op = None - self.device = device - - def begin(self): - if not self.bcast_op or self.bcast_op.graph != _get_default_graph(): - with tf.device(self.device): - self.bcast_op = broadcast_global_variables(self.root_rank) - - def after_create_session(self, session, coord): - session.run(self.bcast_op) - - -@_cache -def _make_allreduce_grads_fn(name, device_dense, device_sparse, - grace, compression, sparse_as_dense, op, gradient_predivide_factor, - num_groups): - if op == Average: - # Split average operation across pre/postscale factors - # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. - prescale_factor = 1.0 / gradient_predivide_factor - postscale_factor = gradient_predivide_factor - else: - prescale_factor = 1.0 - postscale_factor = 1.0 - - def allreduce_grads(gradients): - grads, vars = zip(*gradients) - with tf.name_scope(name + "_Allreduce"): - if sparse_as_dense: - grads = [tf.convert_to_tensor(grad) - if grad is not None and isinstance(grad, tf.IndexedSlices) - else grad for grad in grads] - - if num_groups > 0: - grads_clean = [grad for grad in grads if grad is not None] - grads_split = split_list(grads_clean, num_groups) - - reduce_ops = [] - for group in grads_split: - reduce_ops += _grouped_allreduce_cond(group, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - op=op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor) - return reduce_ops - - return [_allreduce_cond(grad, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - op=op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - name=var.name) - if grad is not None else grad - for (grad,var) in zip(grads, vars)] - - if _executing_eagerly(): - return _make_subgraph(allreduce_grads) - else: - return allreduce_grads - - 
-try: - # TensorFlow 2.x - _LegacyOptimizer = tf.compat.v1.train.Optimizer -except AttributeError: - try: - # TensorFlow 1.x - _LegacyOptimizer = tf.train.Optimizer - except AttributeError: - # Future TensorFlow versions - _LegacyOptimizer = None - -if _LegacyOptimizer is not None: - class _DistributedOptimizer(_LegacyOptimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - combine gradient values before applying gradients to model weights.""" - - def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse='', grace=None, compression=Compression.none, - sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0, - backward_passes_per_step=1, average_aggregated_gradients=False, - num_groups=0): - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking) - - self._optimizer = optimizer - self._allreduce_grads = _make_allreduce_grads_fn( - name, device_dense, device_sparse, grace, compression, sparse_as_dense, op, - gradient_predivide_factor, num_groups) - - self._agg_helper = None - if backward_passes_per_step > 1: - if _executing_eagerly(): - raise ValueError( - "backward_passes_per_step > 1 is not yet supported " - "for _LegacyOptimizer with eager execution." - ) - - self._agg_helper = LocalGradientAggregationHelper( - backward_passes_per_step=backward_passes_per_step, - allreduce_func=self._allreduce_grads, - sparse_as_dense=sparse_as_dense, - average_aggregated_gradients=average_aggregated_gradients, - rank=rank(), - optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY, - ) - - def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - - See Optimizer.compute_gradients() for more info. - - In DistributedOptimizer, compute_gradients() is overriden to also - allreduce the gradients before returning them. 
- """ - gradients = self._optimizer.compute_gradients(*args, **kwargs) - grads, vars = zip(*gradients) - if self._agg_helper: - avg_grads = self._agg_helper.compute_gradients(grads) - else: - avg_grads = self._allreduce_grads(gradients) - return list(zip(avg_grads, vars)) - - def apply_gradients(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - if self._agg_helper: - return self._agg_helper.apply_gradients( - lambda: self._optimizer.apply_gradients(*args, **kwargs), - self._optimizer, - *args, - **kwargs, - ) - - return self._optimizer.apply_gradients(*args, **kwargs) - - def get_slot(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.get_slot(*args, **kwargs) - - def get_slot_names(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.get_slot_names(*args, **kwargs) - - def variables(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.variables(*args, **kwargs) - - class _DistributedAdasumOptimizer(_LegacyOptimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - combine model deltas after applying gradients to model weights.""" - - def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse='', grace=None, compression=Compression.none, backward_passes_per_step=1): - if name is None: - name = "DistributedDelta{}".format(type(optimizer).__name__) - super(_DistributedAdasumOptimizer, self).__init__(name=name, use_locking=use_locking) - - self._optimizer = optimizer - self._name = name - self._device_dense = device_dense - self._device_sparse = device_sparse - self._grace = grace - self._compression = compression - self._backward_passes_per_step = backward_passes_per_step - - def _prepare(self): - self._step_count = tf.get_variable( - name="step_count", shape=[], dtype=tf.int64, trainable=False, - 
initializer=tf.zeros_initializer) - self._is_first_step = tf.cast(tf.math.equal(self._step_count, 0), dtype=tf.bool) - self._is_comm_step = tf.cast(tf.math.equal(self._step_count % self._backward_passes_per_step, self._backward_passes_per_step - 1), dtype=tf.bool) - - def _apply_shared(self, var, get_update_op): - start_slot = self._get_or_make_slot(var, "delta_start") - - # initialize start on the first step - assign_op = tf.cond(self._is_first_step, - lambda: start_slot.assign(var, use_locking=self.use_locking).op, - tf.no_op) - - with tf.control_dependencies([assign_op]): - update_op = get_update_op() - with tf.control_dependencies([update_op]): - def update(): - # delta = var - start - local_delta = var.assign_sub(start_slot, use_locking=self.use_locking) # reuse var's memory - # delta = allreduce (delta) - global_delta = allreduce(local_delta, - device_dense=self._device_dense, - device_sparse=self._device_sparse, - grace=self._grace, - compression=self._compression, - op=Adasum) - # start = start + delta - new_start = start_slot.assign_add(global_delta, use_locking=self.use_locking) - # var = start - return var.assign(new_start, use_locking=self.use_locking).op - - # if its a communication step, then apply logic above - # if its not a communication step then just have the underlying - # optimizer update the model parameters according to its logic - return tf.cond(self._is_comm_step, update, tf.no_op) - - def _apply_dense(self, grad, var): - return self._apply_shared(var, lambda: self._optimizer._apply_dense(grad, var)) - - def _resource_apply_dense(self, grad, handle): - return self._apply_shared(handle, lambda: self._optimizer._resource_apply_dense(grad, handle)) - - def _apply_sparse(self, grad, var): - return self._apply_shared(var, lambda: self._optimizer._apply_sparse(grad, var)) - - def _resource_apply_sparse(self, grad, handle, indices): - return self._apply_shared(handle, lambda: self._optimizer._resource_apply_sparse(grad, handle, indices)) - - def 
_finish(self, update_ops, name_scope): - with tf.control_dependencies(update_ops): - return tf.assign_add(self._step_count, 1) - - def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - See Optimizer.compute_gradients() for more info. - """ - return self._optimizer.compute_gradients(*args, **kwargs) - - def apply_gradients(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.apply_gradients(*args, **kwargs) - - def get_slot(self, var, name): - """Calls this same method on the underlying optimizer.""" - tmp = super(_DistributedAdasumOptimizer, self).get_slot(var, name) - if tmp is not None: - return tmp - return self._optimizer.get_slot(var, name) - - def get_slot_names(self): - """Appends local slot names to those of the underlying optimizer.""" - return super(_DistributedAdasumOptimizer, self).get_slot_names() +\ - self._optimizer.get_slot_names() - - def variables(self, *args, **kwargs): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.variables(*args, **kwargs) - - -def DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='', - device_sparse='', grace=None, compression=Compression.none, - sparse_as_dense=False, backward_passes_per_step=1, - op=Average, gradient_predivide_factor=1.0, - average_aggregated_gradients=False, - num_groups=0): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been combined - across all the Horovod ranks. - - Args: - optimizer: - Optimizer to use for computing gradients and applying updates. - name: - Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: - Whether to use locking when updating variables. 
- See Optimizer.__init__ for more info. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - compression: - Compression algorithm used during allreduce to reduce the amount - of data sent during each parameter update step. Defaults to - not using compression. - sparse_as_dense: - Treat all sparse gradients as dense tensors. This can help improve - performance and memory utilization if the original sparse gradient - has high density. Defaults to false. - backward_passes_per_step: - Number of backward passes to perform before calling hvd.allreduce. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - op: - The reduction operation to use when combining gradients across - different ranks. - gradient_predivide_factor: - If op == Average, gradient_predivide_factor splits the averaging - before and after the sum. Gradients are scaled by - 1.0 / gradient_predivide_factor before the sum and - gradient_predivide_factor / size after the sum. - average_aggregated_gradients: - Whether to average the aggregated gradients that have been accumulated - over multiple mini-batches. If true divides gradients updates by - backward_passes_per_step. Only applicable for backward_passes_per_step > 1. - num_groups: - Number of groups to assign gradient allreduce ops to for explicit - grouping. Defaults to no explicit groups. 
- """ - if gradient_predivide_factor != 1.0: - if rocm_built(): - raise ValueError('gradient_predivide_factor not supported yet with ROCm') - if op != Average: - raise ValueError('gradient_predivide_factor not supported with op != Average') - - if op == Adasum and average_aggregated_gradients: - raise ValueError('Adasum does not support average_aggregated_gradients == True') - - if isinstance(optimizer, _LegacyOptimizer): - if op == Adasum: - return _DistributedAdasumOptimizer(optimizer, name, use_locking, device_dense, - device_sparse, grace, compression, backward_passes_per_step) - - return _DistributedOptimizer( - optimizer=optimizer, - name=name, - use_locking=use_locking, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - sparse_as_dense=sparse_as_dense, - op=op, - gradient_predivide_factor=gradient_predivide_factor, - backward_passes_per_step=backward_passes_per_step, - average_aggregated_gradients=average_aggregated_gradients, - num_groups=num_groups - ) - elif isinstance(optimizer, tf.keras.optimizers.Optimizer): - if op == Adasum: - raise ValueError('op == Adasum is not supported yet with Keras') - - import horovod.tensorflow.keras as hvd_k - return hvd_k.DistributedOptimizer( - optimizer=optimizer, - name=name, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - sparse_as_dense=sparse_as_dense, - gradient_predivide_factor=gradient_predivide_factor, - backward_passes_per_step=backward_passes_per_step, - average_aggregated_gradients=average_aggregated_gradients, - ) - else: - raise ValueError('Provided optimizer doesn\'t inherit from either legacy ' - 'TensorFlow or Keras optimizer: %s' % optimizer) - - -if hasattr(tf, 'GradientTape'): - class _DistributedGradientTape(tf.GradientTape): - def __init__(self, tape, device_dense, device_sparse, grace, compression, sparse_as_dense, op, - gradient_predivide_factor, num_groups, persistent=False, 
watch_accessed_variables=True): - if hasattr(tape, '_watch_accessed_variables'): - super(self.__class__, self).__init__(persistent, watch_accessed_variables) - else: - super(self.__class__, self).__init__(persistent) - - self._tape = tape - self._allreduce_grads = _make_allreduce_grads_fn( - 'DistributedGradientTape', device_dense, device_sparse, grace, compression, - sparse_as_dense, op, gradient_predivide_factor, num_groups) - - def gradient(self, target, sources, output_gradients=None): - gradients = super(self.__class__, self).gradient(target, sources, output_gradients) - return self._allreduce_grads(gradients) - - - def DistributedGradientTape(gradtape, device_dense='', device_sparse='', - grace=None, compression=Compression.none, sparse_as_dense=False, - op=Average, gradient_predivide_factor=1.0, - num_groups=0): - """A tape that wraps another tf.GradientTape, using an allreduce to - combine gradient values before applying gradients to model weights. - - Args: - gradtape: - GradientTape to use for computing gradients and applying updates. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was built with HOROVOD_GPU_OPERATIONS. - compression: - Compression algorithm used during allreduce to reduce the amount - of data sent during each parameter update step. Defaults to - not using compression. - sparse_as_dense: - Treat all sparse gradients as dense tensors. This can help improve - performance and memory utilization if the original sparse gradient - has high density. Defaults to false. - op: - The reduction operation to use when combining gradients across - different ranks. - gradient_predivide_factor: - If op == Average, gradient_predivide_factor splits the averaging - before and after the sum. 
Gradients are scaled by - 1.0 / gradient_predivide_factor before the sum and - gradient_predivide_factor / size after the sum. - num_groups: - Number of groups to assign gradient allreduce ops to for explicit - grouping. Defaults to no explicit groups. - """ - if gradient_predivide_factor != 1.0: - if rocm_built(): - raise ValueError('gradient_predivide_factor not supported yet with ROCm') - if op != Average: - raise ValueError('gradient_predivide_factor not supported with op != Average') - - cls = type(gradtape.__class__.__name__, (gradtape.__class__,), - dict(_DistributedGradientTape.__dict__)) - if hasattr(gradtape, '_watch_accessed_variables'): - return cls(gradtape._tape, device_dense, device_sparse, grace, compression, - sparse_as_dense, op, gradient_predivide_factor, num_groups, - gradtape._persistent, gradtape._watch_accessed_variables) - else: - return cls(gradtape._tape, device_dense, device_sparse, grace, compression, - sparse_as_dense, op, gradient_predivide_factor, num_groups, - gradtape._persistent) \ No newline at end of file diff --git a/patch_files/horovod/tensorflow/keras/__init__.py b/patch_files/horovod/tensorflow/keras/__init__.py deleted file mode 100644 index 31bfac3..0000000 --- a/patch_files/horovod/tensorflow/keras/__init__.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2018 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import inspect - -import tensorflow as tf - -from tensorflow import keras -from tensorflow.python.keras import backend as K - -from horovod.tensorflow import init -from horovod.tensorflow import shutdown -from horovod.tensorflow import is_initialized, start_timeline, stop_timeline -from horovod.tensorflow import size -from horovod.tensorflow import local_size -from horovod.tensorflow import rank -from horovod.tensorflow import local_rank -from horovod.tensorflow import mpi_threads_supported, mpi_enabled, mpi_built -from horovod.tensorflow import gloo_enabled, gloo_built -from horovod.tensorflow import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built -from horovod.tensorflow import Average, Compression, Sum - -import horovod._keras as _impl -from horovod.tensorflow.keras import callbacks, elastic - - -try: - # In later versions of TensorFlow, optimizers are spread across multiple modules. This set is used to distinguish - # stock optimizers that come with tf.keras from custom optimizers that may need to be wrapped specially. - _OPTIMIZER_MODULES = set([obj.__module__ for name, obj in inspect.getmembers(tf.keras.optimizers) - if isinstance(obj, type(tf.keras.optimizers.Optimizer))]) -except: - _OPTIMIZER_MODULES = set() - - -def DistributedOptimizer(optimizer, name=None, - device_dense='', device_sparse='', - grace=None, - compression=Compression.none, - sparse_as_dense=False, - gradient_predivide_factor=1.0, - op=Average, - backward_passes_per_step=1, - average_aggregated_gradients=False): - """ - An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights. - - Args: - optimizer: Optimizer to use for computing gradients and applying updates. - name: Optional name prefix for the operations created when applying - gradients. 
Defaults to "Distributed" followed by the provided - optimizer type. - device_dense: Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_OPERATIONS. - device_sparse: Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_OPERATIONS. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. - sparse_as_dense: Treat all sparse gradients as dense tensors. This can - help improve performance and memory utilization if - the original sparse gradient has high density. - Defaults to false. - gradient_predivide_factor: gradient_predivide_factor splits the averaging - before and after the sum. Gradients are scaled by - 1.0 / gradient_predivide_factor before the sum and - gradient_predivide_factor / size after the sum. - op: The reduction operation to use when combining gradients across - different ranks. Defaults to Average. - backward_passes_per_step: Number of backward passes to perform before calling - hvd.allreduce. This allows accumulating updates over - multiple mini-batches before reducing and applying them. - average_aggregated_gradients: Whether to average the aggregated gradients that - have been accumulated over multiple mini-batches. - If true divides gradient updates by - backward_passes_per_step. - Only applicable for backward_passes_per_step > 1. 
- """ - if gradient_predivide_factor != 1.0 and rocm_built(): - raise ValueError('gradient_predivide_factor not supported yet with ROCm') - - if op != Average and op != Sum: - raise ValueError('op currently only supports Average and Sum') - - return _impl.create_distributed_optimizer( - keras=keras, - optimizer=optimizer, - name=name, - device_dense=device_dense, - device_sparse=device_sparse, - grace=grace, - compression=compression, - sparse_as_dense=sparse_as_dense, - gradient_predivide_factor=gradient_predivide_factor, - op=op, - backward_passes_per_step=backward_passes_per_step, - average_aggregated_gradients=average_aggregated_gradients, - ) - - -def broadcast_global_variables(root_rank): - """Broadcasts all global variables from root rank to all other processes. - - Arguments: - root_rank: Rank of the process from which global variables will be broadcasted - to all other processes. - """ - return _impl.broadcast_global_variables(K, root_rank) - - -def allreduce(value, name=None, average=True, - prescale_factor=1.0, - postscale_factor=1.0, - op=None, - grace=None, - compression=Compression.none): - """ - Perform an allreduce on a tensor-compatible value. - - Arguments: - value: A tensor-compatible value to reduce. - The shape of the input must be identical across all ranks. - name: Optional name for the constants created by this operation. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v0.21.0. - - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average if None is given. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. 
- """ - return _impl.allreduce( - backend=K, - value=value, - name=name, - average=average, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - op=op, - grace=grace, - compression=compression) - - -def allgather(value, name=None): - """ - Perform an allgather on a tensor-compatible value. - - The concatenation is done on the first dimension, so the input values on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Arguments: - value: A tensor-compatible value to gather. - name: Optional name prefix for the constants created by this operation. - """ - return _impl.allgather(K, value, name) - - -def broadcast(value, root_rank, name=None): - """ - Perform a broadcast on a tensor-compatible value. - - Arguments: - value: A tensor-compatible value to reduce. - The shape of the input must be identical across all ranks. - root_rank: Rank of the process from which global variables will be - broadcasted to all other processes. - name: Optional name for the constants created by this operation. - """ - return _impl.broadcast(K, value, root_rank, name) - - -def load_model(filepath, custom_optimizers=None, custom_objects=None, grace=None, compression=Compression.none): - """ - Loads a saved Keras model with a Horovod DistributedOptimizer. - - The DistributedOptimizer will wrap the underlying optimizer used to train - the saved model, so that the optimizer state (params and weights) will - be picked up for retraining. - - By default, all optimizers in the module `keras.optimizers` will be loaded - and wrapped without needing to specify any `custom_optimizers` or - `custom_objects`. - - Arguments: - filepath: One of the following: - - string, path to the saved model, or - - h5py.File object from which to load the model - custom_optimizers: Optional list of Optimizer subclasses to support - during loading. 
- custom_objects: Optional dictionary mapping names (strings) to custom - classes or functions to be considered during deserialization. - compression: Compression algorithm used to reduce the amount of data - sent and received by each worker node. Defaults to not - using compression. - - Returns: - A Keras model instance. - - Raises: - ImportError: If h5py is not available. - ValueError: In case of an invalid savefile. - """ - def wrap_optimizer(cls): - return lambda **kwargs: DistributedOptimizer(cls(**kwargs), grace=grace, compression=compression) - return _impl.load_model(keras, wrap_optimizer, _OPTIMIZER_MODULES, filepath, custom_optimizers, custom_objects) - diff --git a/update_patch_files.sh b/update_patch_files.sh index fffe711..fe83588 100755 --- a/update_patch_files.sh +++ b/update_patch_files.sh @@ -24,11 +24,6 @@ then fi -cp -v patch_files/horovod/tensorflow/__init__.py ${HOROVOD_PATH}/tensorflow/ -cp -v patch_files/horovod/tensorflow/keras/__init__.py ${HOROVOD_PATH}/tensorflow/keras/ -cp -v patch_files/horovod/_keras/__init__.py ${HOROVOD_PATH}/_keras/ cp -v patch_files/horovod/torch/__init__.py ${HOROVOD_PATH}/torch/ cp -v patch_files/horovod/torch/mpi_ops.py ${HOROVOD_PATH}/torch/ - -cp -v patch_files/horovod/keras/__init__.py ${HOROVOD_PATH}/keras/ cp -v patch_files/horovod/torch/optimizer.py ${HOROVOD_PATH}/torch/ \ No newline at end of file From 50d1cdf9aab5f5a1cfe28d9ecc60076647860497 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Sat, 20 Jul 2024 17:15:27 -0400 Subject: [PATCH 03/44] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a777d33..53287ec 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # GRACE - GRAdient ComprEssion for distributed deep learning +##Updated GRACE for new version of Horovod in Pytorch + + + + ## Context Powerful computer clusters are used nowadays to train complex deep neural networks (DNN) on large 
datasets. Distributed training workloads increasingly become communication bound. For this reason, many lossy compression techniques have been proposed to reduce the volume of transferred data, and consequently, accelerate training. From 7df7e5123d0e5953ce9c578f474d1f396d787dd2 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Sat, 20 Jul 2024 17:16:29 -0400 Subject: [PATCH 04/44] Update README.md --- README.md | 66 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 53287ec..edd85ba 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # GRACE - GRAdient ComprEssion for distributed deep learning -##Updated GRACE for new version of Horovod in Pytorch - +## Updated GRACE for new version of Horovod in Pytorch +Updated Patch files @@ -15,65 +15,3 @@ GRACE includes the implementation of popular compression algorithms surveyed in GRACE supports both TensorFlow and Pytorch, and offers easy integration with training scripts. This simplifies the process of implemeting and comparing across compression methods. ![grace_bert_bench](https://user-images.githubusercontent.com/52653060/174394060-8f616ed0-0fca-424d-ab9f-2648423d11c2.jpg) - - -## Install -The `grace_dl` package can be install locally on your nodes. For Horovod support, it is required to apply -the patch files provided. You can find more information [here](INSTALLING.md). - -## Concepts -The three main components of this framework are the `Communicator`, `Compressor`, and `Memory` abstract classes. -- `Communicator` implementations define the communication method used by GRACE. -- `Compressor` implementations provide the `compress` and `decompress` methods. -- `Memory` implementations provide the `update` and `compensate` methods. 
- -## Supported frameworks -Currently the supported frameworks are: -- Horovod 0.21.0 ([TensorFlow 1.15](grace_dl/tensorflow)) -- Horovod 0.21.0 ([PyTorch 1.7.0](grace_dl/torch)) -- DistributedDataParallel (DDP) ([PyTorch >= 1.4.0](grace_dl/dist)) - -## Usage -To use GRACE: -- **As a User**: you need to change a few lines of code on your training script. Refer [here](TRAINING.md) for examples. -- **As an Implementer of new Compressors**: you need to implement a `Compressor` class, and optionally a `Memory` or even a `Communicator` class. -Refer [here](IMPLEMENTING.md) for more information. -- **For Benchmarking methods**: you can use the [training scripts](https://github.com/sands-lab/grace-benchmarks) and compare with [our data](https://github.com/sands-lab/grace-data). - -## Running -Running a training script is no different with GRACE. See the Training guide for more information. - -## Guides -- [Installing GRACE](INSTALLING.md) -- [Implementing GRACE components](IMPLEMENTING.md) -- [Training with GRACE](TRAINING.md) - -## Citation - -If you find this useful, please cite our work as: - -Hang Xu, Chen-Yu Ho, Ahmed M Abdelmoniem, Aritra Dutta, El Houcine Bergou, Konstantinos Karatsenidis, Marco Canini, and Panos Kalnis. GRACE: A Compressed Communication Framework for Distributed Machine Learning. In Proc. of ICDCS, 2021. - -Here's a BibTeX blurb: - -``` -@inproceedings{xu2021grace, - title={GRACE: A compressed communication framework for distributed machine learning}, - author={Xu, Hang and Ho, Chen-Yu and Abdelmoniem, Ahmed M. and Dutta, Aritra and Bergou, EH and Karatsenidis, Konstantinos and Canini, Marco and Kalnis, Panos}, - booktitle={Proc. of 41st IEEE Int. Conf. Distributed Computing Systems (ICDCS)}, - year={2021} -} -``` - -## Publications - -* [GRACE: A Compressed Communication Framework for Distributed Machine Learning](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf).
- H. Xu, C.-Y. Ho, A. M. Abdelmoniem, A. Dutta, E. H. Bergou, K. Karatsenidis, M. Canini, P. Kalnis.
- In Proc. of ICDCS, 2021. - -## Mailing lists - -There is no official mailing list set up yet. Feel free to reach out by email to any of the authors indicated in the [paper](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf). - -## License -See [LICENSE](LICENSE) for the details. From ca343d396cae3a73ee2a0232c3831ecf7bfa68fa Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Sat, 20 Jul 2024 17:16:45 -0400 Subject: [PATCH 05/44] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index edd85ba..feaed8f 100644 --- a/README.md +++ b/README.md @@ -14,4 +14,3 @@ It provides a general framework and an API for implementing gradient compression GRACE includes the implementation of popular compression algorithms surveyed in [GRACE: A Compressed Communication Framework for Distributed Machine Learning](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf). GRACE supports both TensorFlow and Pytorch, and offers easy integration with training scripts. This simplifies the process of implemeting and comparing across compression methods. 
-![grace_bert_bench](https://user-images.githubusercontent.com/52653060/174394060-8f616ed0-0fca-424d-ab9f-2648423d11c2.jpg) From 90c3bf26a6e262837dbe9b3b082c30a3edb71d8c Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 00:45:45 -0400 Subject: [PATCH 06/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes grace_dl/.DS_Store | Bin 6148 -> 6148 bytes grace_dl/torch/.DS_Store | Bin 6148 -> 8196 bytes grace_dl/torch/__init__.py | 12 ++++++------ grace_dl/torch/communicator/allgather.py | 13 +++++++++---- grace_dl/torch/compressor/topk.py | 3 ++- grace_dl/torch/helper.py | 5 +++-- grace_dl/torch/memory/residual.py | 3 +++ patch_files/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/torch/mpi_ops.py | 3 --- patch_files/horovod/torch/optimizer.py | 12 +++++------- 12 files changed, 28 insertions(+), 23 deletions(-) diff --git a/.DS_Store b/.DS_Store index 64c196cbe54c373dd976e1595fa9714f12bfc5b9..c474854b599343a676b4213aab1d838793533c45 100644 GIT binary patch delta 111 zcmZp1XmOa}&nUVvU^hRb=w=>)GUmw&0!?g2Itqs7CX-hPI7}815$8-PPR>cn&(C3) xEGQy1Sxk^`@@C-<5SyngxF|0tKQEnufq`*j;c=$T>=NHtCd-M)Vv#+<1ONhmATR&` delta 55 zcmZp1XmOa}&nUbxU^hRb@Ma!?GG@+{;^ds9{QMlo$sa_dCW{I3O`asuvDupE4D)7o LiEk_$E7+L<^wJUm diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store index d7016090c772ce146d2f15a2161623ee629dab4f..7c72073262c1b15b1b9e63219482b21ec115091a 100644 GIT binary patch delta 21 ccmZoMXffFEk&Q#w)KW*m$jor_Pd08r08KjvW&i*H delta 21 ccmZoMXffFEk&VN|*ic8o(A;$MPd08r08KpxX8-^I diff --git a/grace_dl/torch/.DS_Store b/grace_dl/torch/.DS_Store index 2ecd6ce1255a2c865a79a3e17165e33d226de615..1b1428a405518204e00c4423c42b291248b8985c 100644 GIT binary patch delta 277 zcmZoMXmOBWU|?W$DortDU;r^WfEYvza8E20o2aMAD7rCVH}hr%jz7$c**Q2SHn1>? 
zPPSu}nXJLmz@ckusiRh2t+YXelE0S@)VJU6H8YtX6N7#WCrR40s(Fy;R-TjW8rt^$^0^&AR`%= PAjX4iWY`?fGlv-fYQz|* diff --git a/grace_dl/torch/__init__.py b/grace_dl/torch/__init__.py index 12900df..4030ee4 100644 --- a/grace_dl/torch/__init__.py +++ b/grace_dl/torch/__init__.py @@ -36,23 +36,23 @@ def aggregate(self, tensors): class Communicator(ABC): @abstractmethod - def async_send(self, tensors, name): + def async_send(self, tensors, name,process_set): raise NotImplemented("async_send was not implemented.") @abstractmethod - def wait_receive(self, handles, ctx): + def wait_receive(self, handles, ctx,process_set): raise NotImplemented("wait_receive was not implemented.") def __init__(self, compressor, memory): self.compressor = compressor self.memory = memory - def send_step(self, tensor, name): + def send_step(self, tensor, name,process_set): tensor = self.memory.compensate(tensor, name) tensors_compressed, ctx = self.compressor.compress(tensor, name) self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - handles = self.async_send(tensors_compressed, name) + handles = self.async_send(tensors_compressed, name,process_set) return handles, ctx - def receive_step(self, handles, ctx): - return self.wait_receive(handles, ctx) + def receive_step(self, handles, ctx,process_set): + return self.wait_receive(handles, ctx,process_set) diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py index bd9527f..ff95a6c 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -9,7 +9,8 @@ def __init__(self, compressor, memory, world_size): super().__init__(compressor, memory) self.world_size = world_size - def async_send(self, tensors_compressed, name): + + def async_send(self, tensors_compressed, name,process_set): """ :param tensors_compressed: list of flat tensors to communicate :param name: for the all_gather operation @@ -26,21 +27,25 @@ def async_send(self, tensors_compressed, name): 
tensor_sizes = zip(*tensors_size_ag) # transpose else: tensors_size = torch.tensor(tensors_size) # TODO: set device - gathered = allgather(tensors_size) # tensor of tensor sizes per rank + gathered = allgather(tensor=tensors_size,process_set=process_set) # tensor of tensor sizes per rank tensor_sizes = gathered.view([self.world_size, -1]).t().tolist() # transpose, to list handles = [] for tensor_compressed in tensors_compressed: - handle = allgather_async(tensor_compressed) + handle = allgather_async(tensor=tensor_compressed,process_set=process_set) handles.append(handle) return handles, tensor_sizes - def wait_receive(self, result, ctx): + def wait_receive(self, result, ctx,process_set): handles, tensor_sizes = result + # print("Result",result) tensors_ag = [] for handle, sizes in zip(handles, tensor_sizes): gathered = synchronize(handle) + # print("gathered",gathered) + # print("gathered",len(gathered)) + # print("sizes",sizes) tensors_ag.append(gathered.split(sizes)) # import time # time.sleep(0.001) diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 866d63a..9ccfc2d 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -13,6 +13,7 @@ def sparsify(tensor, compress_ratio): def desparsify(tensors, numel): values, indices = tensors + # print("values, indices",values, indices,numel) tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) tensor_decompressed.scatter_(0, indices, values) return tensor_decompressed @@ -21,7 +22,7 @@ def desparsify(tensors, numel): class TopKCompressor(Compressor): def __init__(self, compress_ratio): - super().__init__() + super().__init__()#tensors_size_are_same=False self.compress_ratio = compress_ratio def compress(self, tensor, name): diff --git a/grace_dl/torch/helper.py b/grace_dl/torch/helper.py index cb05989..0796893 100644 --- a/grace_dl/torch/helper.py +++ b/grace_dl/torch/helper.py @@ -1,9 +1,10 @@ -def 
grace_from_params(params): +def grace_from_params(params,processset): import sys sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") sys.path.append("/content/grace/") import horovod.torch as hvd - world_size = hvd.size() + world_size = processset.size() + # print("world_size",world_size,processset.size()) comp = params.get('compressor', 'none') mem = params.get('memory', 'none') comm = params.get('communicator', 'allreduce') diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index 55819a6..6054d25 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -9,12 +9,15 @@ def __init__(self, beta=1.0, gamma=1.0): def compensate(self, tensor, name): """Update the tensor with the residuals.""" + # print("inside residuals compensate") if name in self.residuals: tensor = self.beta * self.residuals[name] + self.gamma * tensor + # print("inside residuals",self.residuals[name] ) return tensor def update(self, tensor, name, compressor, tensor_compressed, ctx): """Update the residuals.""" tensor_decompressed = compressor.decompress(tensor_compressed, ctx) residual = tensor - tensor_decompressed + # print("residual update",residual) self.residuals[name] = residual diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store index 7c8bd46e92c329f2e7b3f2d9be9f7a15c27f34c2..dced0a2b4db5d862546c4f16fe24b4921fb03d32 100644 GIT binary patch delta 19 acmZoMXffEZfQijWN5RnCWb-nnI8gvRq6Mh{ delta 19 acmZoMXffEZfQijeN5RnCbn`N%I8gvRodu}? 
diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store index 5f8c5e7ffad68965e260075bb66a427e32822fe8..94a06d9e8f063fb315ae1ec923cfc7fdb4cfef34 100644 GIT binary patch delta 19 acmZoMXffC@hl$NdN5RnCWb-1XBvAl7Vg;oD delta 19 acmZoMXffC@hl$NlN5RnCbn_ynBvAl7T?M58 diff --git a/patch_files/horovod/torch/mpi_ops.py b/patch_files/horovod/torch/mpi_ops.py index 688e89f..8ee3474 100644 --- a/patch_files/horovod/torch/mpi_ops.py +++ b/patch_files/horovod/torch/mpi_ops.py @@ -104,9 +104,6 @@ def _check_function(function_factory, tensor): def _allreduce_function_factory(tensor): - # print("tensor",tensor) - # print("tensor",tensor[0].type()) - # exit() return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') diff --git a/patch_files/horovod/torch/optimizer.py b/patch_files/horovod/torch/optimizer.py index 8c742ae..7e3f3b1 100644 --- a/patch_files/horovod/torch/optimizer.py +++ b/patch_files/horovod/torch/optimizer.py @@ -42,6 +42,7 @@ def __init__(self, params, named_parameters,grace, compression, process_set=global_process_set): super(self.__class__, self).__init__(params) self._grace = grace + self._process_set=process_set self._compression = compression if named_parameters is not None: @@ -193,11 +194,8 @@ def _allreduce_grad_async(self, p): tensor = tensor.to_dense() else: return self._sparse_allreduce_grad_async(p, name) - # print("self._compression.",self._compression.compress) - # print("self._compression.",self._compression.compressor) - if self._grace and self._groups == 0 and self.op == Average: - # print("HERE") - handle, ctx = self._grace.send_step(tensor, name) + if self._grace and self._groups is None and self.op == Average: + handle, ctx = self._grace.send_step(tensor, name,self._process_set) postscale_factor = self.gradient_predivide_factor else: @@ -290,8 +288,8 @@ def synchronize(self): self._group_counts[p] = 0 else: # When handle is a callable function, it returns the aggregated tensor result - if self._grace and 
self._groups == 0 and self.op == Average: - output = self._grace.receive_step(handle, ctx) + if self._grace and self._groups is None and self.op == Average: + output = self._grace.receive_step(handle, ctx,self._process_set) p.grad.set_(output) else: output = synchronize(handle) if not callable(handle) else handle() From 2143780810571e489332cbf29f085ac37953938c Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 00:48:01 -0400 Subject: [PATCH 07/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes examples/.DS_Store | Bin 6148 -> 0 bytes examples/dist/.DS_Store | Bin 6148 -> 0 bytes .../tensorflow/tensorflow2_keras_mnist.py | 15 +- examples/tensorflow/tensorflow2_mnist.py | 12 +- examples/tensorflow/tensorflow_keras_mnist.py | 34 +- examples/torch/.DS_Store | Bin 6148 -> 0 bytes examples/torch/hvd.py | 289 ------------- examples/torch/hvd1.py | 92 ---- examples/torch/pytorch_mnist.py | 400 +++++++----------- grace_dl/.DS_Store | Bin 6148 -> 6148 bytes grace_dl/torch/.DS_Store | Bin 8196 -> 8196 bytes patch_files/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 6148 -> 6148 bytes update_patch_files.sh | 5 + 15 files changed, 168 insertions(+), 679 deletions(-) delete mode 100644 examples/.DS_Store delete mode 100644 examples/dist/.DS_Store delete mode 100644 examples/torch/.DS_Store delete mode 100644 examples/torch/hvd.py delete mode 100644 examples/torch/hvd1.py diff --git a/.DS_Store b/.DS_Store index c474854b599343a676b4213aab1d838793533c45..bad07e965141e67ad73cab84e3ab9c0ed6fb1380 100644 GIT binary patch delta 45 ucmZp1XmQx!C&Zy^YN?}OWM(*-OT=OFG$BbwhRqv0OT=OFG$Bbw#?2dqf*66~Mj&yM&0HeCxB(Fw3y=T+ diff --git a/examples/.DS_Store b/examples/.DS_Store deleted file mode 100644 index 7142159d8408f829308003c77c7f4ae722e0e806..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKL2uJA6n@?^mTnrV1V{`nmN-(0h80a{mr%-y8$ob@scV|BMx=4oq|2eI$b}R9 
z2>u3tfIAod0SEpC@7ZoiwiIp%4L`~LY`^!M`1=~W1_0Kb1r9(H0BkIT)(%!RMt&+6 zY(tM+Lu52ZkPc&Ch!{J@3~7QQbhj76{K z$HK=k`}n*6v`?OprW8;RV@2H7m_4yLcD_Z&AOf7u@RFQPE-H?KbyNj8X*a4Hj`AjFW@ zFQX(7c~{JmAeFhEa@Y;K(Qoe0=beL2i#vx0ix!_d?Zf*m-g)?7v1r&g?%X?i)_)t1 z6Y+uG9yh|}=C`N%tNJ6hY8O{#y*8kt06e|DZlbapdv}LRHEQP^i(kt8DYl{#oY0DD z=REVKNi336^b-kNs3d7PgwbWuue7A4YQ#oqxT@8yuT^hZp?A(op;CTXm->jR8vNYz z&uKv3wWfcm_U_sU<1h>u2L4wDXn){X2wjbZLb-KdC9eR877k0pHr*vChih~-77EdW z##AbzN@e!V>MsbBgnSzx*j%9<6;+t5cp^sAo W(bZTe#1S;}M?lhGD#O5EW#A`v8~WG) diff --git a/examples/dist/.DS_Store b/examples/dist/.DS_Store deleted file mode 100644 index 086c2e5940a62da8ca0fabc11f205d5c2741ddae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c!5S)b+k!Y@@^ba7BKR7}_qJyKdtbO4rgM|}A(H{W-k*-1r=Naq>%Sm6>E81OX9{ypH_4%fKDEv_c~ zPrUJ*SM2dV>|eIeyWQ54Wl}&2NC7Dz1*E`13RF2=o*i_G$WlNG9HRpMeQ0#YzHm&8 zPX~u+0f-BR!#Iy#g4jGj>)|36Qdk+;pO-|k}|LPocn#@m>6`%gHF`Xfa@ZY0{^YR4< `lr = 1.0 * hvd.size()` during # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1,initial_lr=0.01), + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. @@ -94,4 +93,4 @@ # Train the model. # Horovod: adjust number of steps based on number of GPUs. 
-mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose) \ No newline at end of file +mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose) diff --git a/examples/tensorflow/tensorflow2_mnist.py b/examples/tensorflow/tensorflow2_mnist.py index 515705b..e61bbcc 100644 --- a/examples/tensorflow/tensorflow2_mnist.py +++ b/examples/tensorflow/tensorflow2_mnist.py @@ -14,15 +14,11 @@ # ============================================================================== import tensorflow as tf -import sys -sys.path.append("/content/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") import horovod.tensorflow as hvd -from grace.grace_dl.tensorflow.communicator.allgather import Allgather -from grace.grace_dl.tensorflow.compressor.topk import TopKCompressor -from grace.grace_dl.tensorflow.memory.residual import ResidualMemory +from grace_dl.tensorflow.communicator.allgather import Allgather +from grace_dl.tensorflow.compressor.topk import TopKCompressor +from grace_dl.tensorflow.memory.residual import ResidualMemory # Horovod: initialize Horovod. hvd.init() @@ -72,7 +68,7 @@ def training_step(images, labels, first_batch): loss_value = loss(labels, probs) # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape, grc) + tape = hvd.DistributedGradientTape(tape, grace=grc) grads = tape.gradient(loss_value, mnist_model.trainable_variables) opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) diff --git a/examples/tensorflow/tensorflow_keras_mnist.py b/examples/tensorflow/tensorflow_keras_mnist.py index a558130..2d6fce1 100644 --- a/examples/tensorflow/tensorflow_keras_mnist.py +++ b/examples/tensorflow/tensorflow_keras_mnist.py @@ -1,44 +1,28 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== from __future__ import print_function import math -import sys -sys.path.append("/content/") -print(sys.path) + import tensorflow as tf from tensorflow import keras -from tensorflow.compat.v1.keras import backend as K +from tensorflow.keras import backend as K from tensorflow.keras.datasets import mnist from tensorflow.keras.layers import Conv2D, MaxPooling2D from tensorflow.keras.layers import Dense, Dropout, Flatten from tensorflow.keras.models import Sequential import horovod.tensorflow.keras as hvd -from grace.grace_dl.tensorflow.communicator.allgather import Allgather -from grace.grace_dl.tensorflow.compressor.topk import TopKCompressor -from grace.grace_dl.tensorflow.memory.residual import ResidualMemory +from grace_dl.tensorflow.communicator.allgather import Allgather +from grace_dl.tensorflow.compressor.topk import TopKCompressor +from grace_dl.tensorflow.memory.residual import ResidualMemory # Horovod: initialize Horovod. 
hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.compat.v1.ConfigProto() +config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.compat.v1.Session(config=config)) +K.set_session(tf.Session(config=config)) batch_size = 128 num_classes = 10 @@ -92,7 +76,7 @@ grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) # Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt, grc) +opt = hvd.DistributedOptimizer(opt, grace=grc) model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, @@ -117,4 +101,4 @@ validation_data=(x_test, y_test)) score = model.evaluate(x_test, y_test, verbose=0) print('Test loss:', score[0]) -print('Test accuracy:', score[1]) \ No newline at end of file +print('Test accuracy:', score[1]) diff --git a/examples/torch/.DS_Store b/examples/torch/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0= 1.6.0""") - - # Horovod: limit # of CPU threads to be used per worker. 
- torch.set_num_threads(1) - - kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent - # issues with Infiniband implementations that are not fork-safe - if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and - mp._supports_context and 'forkserver' in mp.get_all_start_methods()): - kwargs['multiprocessing_context'] = 'forkserver' - - data_dir = args.data_dir or './data' - with FileLock(os.path.expanduser("~/.horovod_lock")): - train_dataset = \ - datasets.MNIST(data_dir, train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - - # Horovod: use DistributedSampler to partition the training data. - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. - test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - model = Net() - - # By default, Adasum doesn't need scaling up learning rate. - lr_scaler = hvd.size() if not args.use_adasum else 1 - - if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - - # Horovod: scale learning rate by lr_scaler. 
- optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - from grace.grace_dl.torch.helper import grace_from_params - params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} - grc = grace_from_params(params) - # Horovod: wrap optimizer with DistributedOptimizer. - if(hvd.rank()%2==0): - optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - grace=grc, - # compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor, - process_set=even_set) - else: - optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - grace=grc, - # compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor, - process_set=odd_set) - if args.use_mixed_precision: - # Initialize scaler in global scale - scaler = torch.cuda.amp.GradScaler() - - for epoch in range(1, args.epochs + 1): - if args.use_mixed_precision: - train_mixed_precision(epoch, scaler) - else: - train_epoch(epoch) - # Keep test in full precision since computation is relatively light. 
- test() - - -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - if args.num_proc: - # run training through horovod.run - print('Running training through horovod.run') - horovod.run(main, - args=(args,), - np=args.num_proc, - hosts=args.hosts, - use_gloo=args.communication == 'gloo', - use_mpi=args.communication == 'mpi') - else: - # this is running via horovodrun - main(args) \ No newline at end of file diff --git a/examples/torch/hvd1.py b/examples/torch/hvd1.py deleted file mode 100644 index 8475f57..0000000 --- a/examples/torch/hvd1.py +++ /dev/null @@ -1,92 +0,0 @@ -from __future__ import print_function -from multiprocessing import freeze_support -import sys -sys.path.append("/content/") -sys.path.append("/content/grace/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") -import argparse -import sys -import os -# print(sys.path) -sys.path.append(os.path.join('/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/', 'grace/')) -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data.distributed -from torchvision import datasets, transforms -import horovod -import horovod.torch as hvd -from grace.grace_dl.torch.communicator.allgather import Allgather -from grace.grace_dl.torch.communicator.allreduce import Allreduce -from grace.grace_dl.torch.compressor.topk import TopKCompressor -from grace.grace_dl.torch.memory.residual import ResidualMemory -import torch -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') 
-parser.add_argument('--epochs', type=int, default=2, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') -parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -parser.add_argument('--num-proc', type=int) -# parser.add_argument('--hosts', help='hosts to run on in notation: hostname:slots[,host2:slots[,...]]') -parser.add_argument('--communication', help='collaborative communication to use: gloo, mpi') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') - -parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, - help='apply gradient predivide factor in optimizer (default: 1.0)') - -args = parser.parse_args() -args.cuda = not args.no_cuda and torch.cuda.is_available() -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x) - - -if __name__ == '__main__': - freeze_support() - 
args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - import grace.examples.torch.pytorch_mnist as pmt - if args.num_proc: - # run training through horovod.run - print('Running training through horovod.run1') - ans=horovod.run(pmt.mainf, - args=(args,), - np=args.num_proc, - # hosts=args.hosts, - use_gloo=True, - use_mpi=args.communication == 'mpi') - print("ans",ans) - else: - # this is running via horovodrun - main(args) \ No newline at end of file diff --git a/examples/torch/pytorch_mnist.py b/examples/torch/pytorch_mnist.py index 82c53cf..00b71e5 100644 --- a/examples/torch/pytorch_mnist.py +++ b/examples/torch/pytorch_mnist.py @@ -1,57 +1,78 @@ from __future__ import print_function -import sys -sys.path.append("/content/") -sys.path.append("/content/grace/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/") -sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") + import argparse -import sys -import os -# print(sys.path) -sys.path.append(os.path.join('/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/', 'grace/')) + import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data.distributed from torchvision import datasets, transforms -import horovod + import horovod.torch as hvd -from grace.grace_dl.torch.communicator.allgather import Allgather -from grace.grace_dl.torch.communicator.allreduce import Allreduce -from grace.grace_dl.torch.compressor.topk import TopKCompressor -from grace.grace_dl.torch.memory.residual import ResidualMemory -import torch +from grace_dl.torch.communicator.allgather import Allgather +from grace_dl.torch.compressor.topk import TopKCompressor +from grace_dl.torch.memory.residual import ResidualMemory + # Training settings -# parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -# parser.add_argument('--batch-size', type=int, default=64, metavar='N', -# help='input batch size for 
training (default: 64)') -# parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', -# help='input batch size for testing (default: 1000)') -# parser.add_argument('--epochs', type=int, default=2, metavar='N', -# help='number of epochs to train (default: 10)') -# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', -# help='learning rate (default: 0.01)') -# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', -# help='SGD momentum (default: 0.5)') -# parser.add_argument('--no-cuda', action='store_true', default=False, -# help='disables CUDA training') -# parser.add_argument('--seed', type=int, default=42, metavar='S', -# help='random seed (default: 42)') -# parser.add_argument('--log-interval', type=int, default=10, metavar='N', -# help='how many batches to wait before logging training status') -# parser.add_argument('--fp16-allreduce', action='store_true', default=False, -# help='use fp16 compression during allreduce') -# parser.add_argument('--num-proc', type=int) -# # parser.add_argument('--hosts', help='hosts to run on in notation: hostname:slots[,host2:slots[,...]]') -# parser.add_argument('--communication', help='collaborative communication to use: gloo, mpi') -# parser.add_argument('--use-adasum', action='store_true', default=False, -# help='use adasum algorithm to do reduction') +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + 
help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +# Horovod: initialize library. +hvd.init() +torch.manual_seed(args.seed) + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + +# Horovod: limit # of CPU threads to be used per worker. +torch.set_num_threads(1) + +kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} +train_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the training data. +train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + +test_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the test data. 
+test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) -# parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, -# help='apply gradient predivide factor in optimizer (default: 1.0)') -# args = parser.parse_args() -# args.cuda = not args.no_cuda and torch.cuda.is_available() class Net(nn.Module): def __init__(self): super(Net, self).__init__() @@ -70,221 +91,86 @@ def forward(self, x): x = self.fc2(x) return F.log_softmax(x) -def mainf(args): - # Horovod: initialize library. - even_set = hvd.ProcessSet([0,2]) - odd_set = hvd.ProcessSet([1,3]) - hvd.init(process_sets=[even_set, odd_set]) - torch.manual_seed(args.seed) - # print("hvd.local_rank()",hvd.local_rank()) - # print("hvd.global_rank()",hvd.global_rank())hvd. - # print("hvd.rank()",hvd.rank()) - if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - - # Horovod: limit # of CPU threads to be used per worker. - torch.set_num_threads(4) - - kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - train_dataset = \ - datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the training data. 
- train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. - test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - - - model = Net() - model1=Net() - if args.cuda: - # Move model to GPU. - model.cuda() - model1.cuda() - - # Horovod: scale learning rate by the number of GPUs. - optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_parameters(model1.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0)#,process_set=even_set) - hvd.broadcast_optimizer_state(optimizer1, root_rank=0)#,process_set=even_set) - # GRACE: compression algorithm. - # grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) - - from grace.grace_dl.torch.helper import grace_from_params - params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} - grc = grace_from_params(params) - # print("grc",grc) - - # Horovod: wrap optimizer with DistributedOptimizer. 
- # compression = Allgather(TopKCompressor(0.01), NoneMemory()) - # compression = Allreduce(RandomKCompressor(), ResidualMemory()) - # hvd.Compression.fp16 - - # - from grace.grace_dl.torch.compressor.randomk import RandomKCompressor - cpr=TopKCompressor(0.1) - # if(hvd.rank()%2==0): - # optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc)#,process_set=even_set#hvd.Compression.fp16) - # else: - # optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc,process_set=odd_set)#hvd.Compression.fp16) - - def train(epoch): - - # Horovod: set epoch to sampler for shuffling. - # opt_params=[] - # opt_params.append({"params": layer.parameters(), "lr": lr}) - # optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), - # momentum=args.momentum) - optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - compression = Allgather(TopKCompressor(0.01), ResidualMemory(), hvd.size()) - if(hvd.rank()%2==0): - - optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - optimizer = hvd.DistributedOptimizer(optimizer, model.named_parameters(),grace=grc,process_set=even_set,op=hvd.Adasum if args.use_adasum else hvd.Average,gradient_predivide_factor=args.gradient_predivide_factor)#hvd.Compression.fp16) - model.train() - else: - - optimizer1 = optim.SGD(model1.parameters(), lr=args.lr * hvd.size(), - momentum=args.momentum) - optimizer1 = hvd.DistributedOptimizer(optimizer1, model1.named_parameters(),grace=grc,process_set=odd_set,op=hvd.Adasum if args.use_adasum else hvd.Average,gradient_predivide_factor=args.gradient_predivide_factor)#hvd.Compression.fp16) - model1.train() - - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - - - if(hvd.rank()%2==0): - 
optimizer.zero_grad() - output = model(data) - else : - optimizer1.zero_grad() - output = model1(data) - loss = F.nll_loss(output, target) - loss.backward() - if(hvd.rank()%2==0): - optimizer.step() - else : - optimizer1.step() - - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) - - - def metric_average(val, name): - tensor = torch.tensor(val)#.clone().detach().requires_grad_(True) - # tensor1 = torch.tensor(val).clone().detach().requires_grad_(True) - # avg_tensor1 = hvd.allreduce(tensor, name=name,process_set=even_set) - if(hvd.rank()%2==0): - # print("EVEN reduce") - avg_tensor = hvd.allreduce(tensor, name=name,process_set=even_set) - # print("EVEN reduce completed") - return avg_tensor.item() - else: - # print("ODD reduce") - - avg_tensor1 = hvd.allreduce(tensor, name=name,process_set=odd_set) - # print("ODD reduce completed") - return avg_tensor1.item() - # - # print(avg_tensor1.item(),avg_tensor.item()) - - - - - def test(): - if(hvd.rank()%2==0): - model.eval() - else: - model1.eval() - test_loss = 0. - test_accuracy = 0. - for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - if(hvd.rank()%2==0): - output = model(data) - else : - output = model1(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. 
- - # if(hvd.rank()%2==0): - test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - # else: - # test_loss = metric_average(test_loss, 'avg_loss1') - # test_accuracy = metric_average(test_accuracy, 'avg_accuracy1') - - # Horovod: print output only on first rank. - # if hvd.rank() == 0: +model = Net() + +if args.cuda: + # Move model to GPU. + model.cuda() + +# Horovod: scale learning rate by the number of GPUs. +optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + +# Horovod: broadcast parameters & optimizer state. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + +# GRACE: compression algorithm. +# grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) + +from grace_dl.torch.helper import grace_from_params +params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} +grc = grace_from_params(params) + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer(optimizer, grc, named_parameters=model.named_parameters()) + + +def train(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), 100. 
* batch_idx / len(train_loader), loss.item())) + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) - import time - time.sleep(5) - + test_loss, 100. 
* test_accuracy)) - for epoch in range(1, args.epochs + 1): - train(epoch) - # if(hvd.rank()%2==0): - test() - return "1" -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - if args.num_proc: - # run training through horovod.run - print('Running training through horovod.run') - horovod.run(mainf, - args=(args,), - np=args.num_proc, - hosts=args.hosts, - use_gloo=args.communication == 'gloo', - use_mpi=args.communication == 'mpi') - else: - # this is running via horovodrun - main(args) \ No newline at end of file +for epoch in range(1, args.epochs + 1): + train(epoch) + test() diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store index 7c72073262c1b15b1b9e63219482b21ec115091a..f530e2364144b4e446d40dbda61eafc6e2c1a79a 100644 GIT binary patch delta 37 scmZoMXfc?Oz{t2UaX&L7<7P#cQbtC`$@^I)Hm_%$3uJEgW#29U0Mhab#Q*>R delta 40 vcmZoMXfc?Ouvw5}HS=U{7Ewlq&5A6gjEoGE_p?fDUe7ug$lUD9zFhzS>f{R@ diff --git a/grace_dl/torch/.DS_Store b/grace_dl/torch/.DS_Store index 1b1428a405518204e00c4423c42b291248b8985c..2715dccc4ff900755cb3c6b8d258e8090206336a 100644 GIT binary patch delta 14 VcmZp1XmQxEQiPFl^D2=jZU8721kL~e delta 14 VcmZp1XmQxEQiPFV^D2=jZU86{1kC^d diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store index dced0a2b4db5d862546c4f16fe24b4921fb03d32..39d79eb7b39ffba85e769fb7c64c8e5683b20d69 100644 GIT binary patch delta 21 ccmZoMXffC@kBLLq)KW*m$jor_GNw3D07oGP4*&oF delta 21 ccmZoMXffC@kBP&?*hojg(A;G6GNw3D07obW5C8xG diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store index 94a06d9e8f063fb315ae1ec923cfc7fdb4cfef34..ba87a362e880b0dcac61d55787ec084b695bf218 100644 GIT binary patch delta 21 ccmZoMXffC@i-|+m)KW*m$jor_BBmr!07lsc3jhEB delta 21 ccmZoMXffC@i;2U;*hojg(A;G6BBmr!07l>j3;+NC diff --git a/update_patch_files.sh b/update_patch_files.sh index fe83588..fffe711 100755 --- a/update_patch_files.sh +++ b/update_patch_files.sh @@ -24,6 +24,11 @@ then fi +cp -v 
patch_files/horovod/tensorflow/__init__.py ${HOROVOD_PATH}/tensorflow/ +cp -v patch_files/horovod/tensorflow/keras/__init__.py ${HOROVOD_PATH}/tensorflow/keras/ +cp -v patch_files/horovod/_keras/__init__.py ${HOROVOD_PATH}/_keras/ cp -v patch_files/horovod/torch/__init__.py ${HOROVOD_PATH}/torch/ cp -v patch_files/horovod/torch/mpi_ops.py ${HOROVOD_PATH}/torch/ + +cp -v patch_files/horovod/keras/__init__.py ${HOROVOD_PATH}/keras/ cp -v patch_files/horovod/torch/optimizer.py ${HOROVOD_PATH}/torch/ \ No newline at end of file From 95c8fb016a6e1a04e9d6c248a043c4748564a720 Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 00:52:27 -0400 Subject: [PATCH 08/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes grace_dl/tensorflow/communicator/allgather.py | 2 +- grace_dl/tensorflow/compressor/topk.py | 2 +- grace_dl/tensorflow/memory/residual.py | 6 +- grace_dl/torch/__init__.py | 12 +- grace_dl/torch/communicator/allgather.py | 15 +- grace_dl/torch/communicator/allreduce.py | 2 +- grace_dl/torch/compressor/fp16.py | 6 +- grace_dl/torch/compressor/randomk.py | 2 +- grace_dl/torch/compressor/topk.py | 5 +- grace_dl/torch/helper.py | 9 +- grace_dl/torch/memory/residual.py | 5 +- patch_files/horovod/_keras/__init__.py | 183 ++ patch_files/horovod/keras/__init__.py | 183 ++ patch_files/horovod/tensorflow/__init__.py | 761 ++++++ .../horovod/tensorflow/keras/__init__.py | 225 ++ patch_files/horovod/torch/__init__.py | 57 +- patch_files/horovod/torch/mpi_ops.py | 712 +----- patch_files/horovod/torch/optimizer.py | 173 +- untitled folder/.DS_Store | Bin 0 -> 6148 bytes untitled folder/grace_dl/.DS_Store | Bin 0 -> 6148 bytes untitled folder/grace_dl/__init__.py | 1 + untitled folder/grace_dl/dist/.DS_Store | Bin 0 -> 6148 bytes untitled folder/grace_dl/dist/__init__.py | 51 + .../grace_dl/dist/communicator/all_to_all.py | 124 + .../grace_dl/dist/communicator/allgather.py | 45 + .../grace_dl/dist/communicator/allreduce.py | 13 + 
.../grace_dl/dist/communicator/broadcast.py | 33 + .../dist/compressor/cnat_cuda/cnat.cpp | 29 + .../dist/compressor/cnat_cuda/cnat_cuda.cu | 184 ++ .../dist/compressor/cnat_cuda/setup.py | 14 + .../grace_dl/dist/compressor/dgc.py | 50 + .../grace_dl/dist/compressor/efsignsgd.py | 33 + .../grace_dl/dist/compressor/fp16.py | 22 + .../grace_dl/dist/compressor/natural.py | 57 + .../grace_dl/dist/compressor/none.py | 12 + .../grace_dl/dist/compressor/onebit.py | 31 + .../grace_dl/dist/compressor/powersgd.py | 65 + .../grace_dl/dist/compressor/qsgd.py | 79 + .../dist/compressor/qsgd_cuda/example.py | 16 + .../dist/compressor/qsgd_cuda/qsgd.cpp | 24 + .../dist/compressor/qsgd_cuda/qsgd_cuda.cu | 538 +++++ .../dist/compressor/qsgd_cuda/setup.py | 14 + .../cub/agent/agent_histogram.cuh | 787 ++++++ .../cub/agent/agent_radix_sort_downsweep.cuh | 772 ++++++ .../cub/agent/agent_radix_sort_upsweep.cuh | 526 ++++ .../radixtopk_cuda/cub/agent/agent_reduce.cuh | 385 +++ .../cub/agent/agent_reduce_by_key.cuh | 549 +++++ .../radixtopk_cuda/cub/agent/agent_rle.cuh | 837 +++++++ .../radixtopk_cuda/cub/agent/agent_scan.cuh | 471 ++++ .../cub/agent/agent_segment_fixup.cuh | 375 +++ .../cub/agent/agent_select_if.cuh | 703 ++++++ .../cub/agent/agent_spmv_orig.cuh | 925 +++++++ .../cub/agent/single_pass_scan_operators.cuh | 815 +++++++ .../cub/block/block_adjacent_difference.cuh | 596 +++++ .../cub/block/block_discontinuity.cuh | 1148 +++++++++ .../cub/block/block_exchange.cuh | 1248 ++++++++++ .../cub/block/block_histogram.cuh | 415 ++++ .../radixtopk_cuda/cub/block/block_load.cuh | 1268 ++++++++++ .../cub/block/block_radix_rank.cuh | 697 ++++++ .../cub/block/block_radix_sort.cuh | 862 +++++++ .../cub/block/block_raking_layout.cuh | 152 ++ .../radixtopk_cuda/cub/block/block_reduce.cuh | 607 +++++ .../radixtopk_cuda/cub/block/block_scan.cuh | 2126 +++++++++++++++++ .../cub/block/block_shuffle.cuh | 305 +++ .../radixtopk_cuda/cub/block/block_store.cuh | 1000 ++++++++ 
.../block_histogram_atomic.cuh | 82 + .../specializations/block_histogram_sort.cuh | 226 ++ .../specializations/block_reduce_raking.cuh | 222 ++ .../block_reduce_raking_commutative_only.cuh | 199 ++ .../block_reduce_warp_reductions.cuh | 222 ++ .../specializations/block_scan_raking.cuh | 666 ++++++ .../specializations/block_scan_warp_scans.cuh | 392 +++ .../block_scan_warp_scans2.cuh | 436 ++++ .../block_scan_warp_scans3.cuh | 418 ++++ .../compressor/radixtopk_cuda/cub/cub.cuh | 95 + .../cub/device/device_histogram.cuh | 866 +++++++ .../cub/device/device_partition.cuh | 273 +++ .../cub/device/device_radix_sort.cuh | 796 ++++++ .../cub/device/device_reduce.cuh | 734 ++++++ .../cub/device/device_run_length_encode.cuh | 278 +++ .../radixtopk_cuda/cub/device/device_scan.cuh | 443 ++++ .../device/device_segmented_radix_sort.cuh | 875 +++++++ .../cub/device/device_segmented_reduce.cuh | 619 +++++ .../cub/device/device_select.cuh | 369 +++ .../radixtopk_cuda/cub/device/device_spmv.cuh | 174 ++ .../device/dispatch/dispatch_histogram.cuh | 1096 +++++++++ .../device/dispatch/dispatch_radix_sort.cuh | 1657 +++++++++++++ .../cub/device/dispatch/dispatch_reduce.cuh | 882 +++++++ .../dispatch/dispatch_reduce_by_key.cuh | 554 +++++ .../cub/device/dispatch/dispatch_rle.cuh | 538 +++++ .../cub/device/dispatch/dispatch_scan.cuh | 563 +++++ .../device/dispatch/dispatch_select_if.cuh | 542 +++++ .../device/dispatch/dispatch_spmv_orig.cuh | 850 +++++++ .../radixtopk_cuda/cub/grid/grid_barrier.cuh | 211 ++ .../cub/grid/grid_even_share.cuh | 222 ++ .../radixtopk_cuda/cub/grid/grid_mapping.cuh | 113 + .../radixtopk_cuda/cub/grid/grid_queue.cuh | 220 ++ .../radixtopk_cuda/cub/host/mutex.cuh | 171 ++ .../cub/iterator/arg_index_input_iterator.cuh | 259 ++ .../cache_modified_input_iterator.cuh | 240 ++ .../cache_modified_output_iterator.cuh | 254 ++ .../cub/iterator/constant_input_iterator.cuh | 235 ++ .../cub/iterator/counting_input_iterator.cuh | 228 ++ 
.../cub/iterator/discard_output_iterator.cuh | 220 ++ .../cub/iterator/tex_obj_input_iterator.cuh | 310 +++ .../cub/iterator/tex_ref_input_iterator.cuh | 374 +++ .../cub/iterator/transform_input_iterator.cuh | 252 ++ .../radixtopk_cuda/cub/thread/thread_load.cuh | 438 ++++ .../cub/thread/thread_operators.cuh | 317 +++ .../cub/thread/thread_reduce.cuh | 152 ++ .../radixtopk_cuda/cub/thread/thread_scan.cuh | 268 +++ .../cub/thread/thread_search.cuh | 154 ++ .../cub/thread/thread_store.cuh | 422 ++++ .../radixtopk_cuda/cub/util_allocator.cuh | 708 ++++++ .../radixtopk_cuda/cub/util_arch.cuh | 151 ++ .../radixtopk_cuda/cub/util_debug.cuh | 145 ++ .../radixtopk_cuda/cub/util_device.cuh | 347 +++ .../radixtopk_cuda/cub/util_macro.cuh | 103 + .../radixtopk_cuda/cub/util_namespace.cuh | 46 + .../radixtopk_cuda/cub/util_ptx.cuh | 729 ++++++ .../radixtopk_cuda/cub/util_type.cuh | 1141 +++++++++ .../warp/specializations/warp_reduce_shfl.cuh | 551 +++++ .../warp/specializations/warp_reduce_smem.cuh | 375 +++ .../warp/specializations/warp_scan_shfl.cuh | 656 +++++ .../warp/specializations/warp_scan_smem.cuh | 397 +++ .../radixtopk_cuda/cub/warp/warp_reduce.cuh | 612 +++++ .../radixtopk_cuda/cub/warp/warp_scan.cuh | 936 ++++++++ .../dist/compressor/radixtopk_cuda/example.py | 31 + .../compressor/radixtopk_cuda/rdxtopk.cpp | 18 + .../compressor/radixtopk_cuda/rdxtopk_cuda.cu | 582 +++++ .../dist/compressor/radixtopk_cuda/setup.py | 14 + .../grace_dl/dist/compressor/randomk.py | 41 + .../grace_dl/dist/compressor/signsgd.py | 30 + .../grace_dl/dist/compressor/signum.py | 37 + .../grace_dl/dist/compressor/terngrad.py | 32 + .../grace_dl/dist/compressor/threshold.py | 27 + .../grace_dl/dist/compressor/topk.py | 69 + untitled folder/grace_dl/dist/helper.py | 102 + untitled folder/grace_dl/dist/memory/dgc.py | 39 + .../grace_dl/dist/memory/efsignsgd.py | 19 + untitled folder/grace_dl/dist/memory/none.py | 11 + .../grace_dl/dist/memory/powersgd.py | 37 + 
.../grace_dl/dist/memory/residual.py | 20 + untitled folder/grace_dl/tensorflow/.DS_Store | Bin 0 -> 8196 bytes .../grace_dl/tensorflow/__init__.py | 114 + .../tensorflow/communicator/.DS_Store | Bin 0 -> 6148 bytes .../tensorflow/communicator/allgather.py | 46 + .../tensorflow/communicator/allreduce.py | 20 + .../tensorflow/communicator/broadcast.py | 19 + .../grace_dl/tensorflow/compressor/.DS_Store | Bin 0 -> 6148 bytes .../grace_dl/tensorflow/compressor/adaq.py | 93 + .../grace_dl/tensorflow/compressor/dgc.py | 68 + .../tensorflow/compressor/efsignsgd.py | 40 + .../grace_dl/tensorflow/compressor/fp16.py | 25 + .../tensorflow/compressor/inceptionn.py | 188 ++ .../grace_dl/tensorflow/compressor/natural.py | 42 + .../grace_dl/tensorflow/compressor/none.py | 14 + .../grace_dl/tensorflow/compressor/onebit.py | 42 + .../grace_dl/tensorflow/compressor/packing.py | 30 + .../tensorflow/compressor/powersgd.py | 58 + .../grace_dl/tensorflow/compressor/qsgd.py | 44 + .../grace_dl/tensorflow/compressor/randomk.py | 53 + .../grace_dl/tensorflow/compressor/signsgd.py | 30 + .../grace_dl/tensorflow/compressor/signum.py | 45 + .../grace_dl/tensorflow/compressor/sketch.py | 39 + .../tensorflow/compressor/terngrad.py | 42 + .../tensorflow/compressor/threshold.py | 33 + .../grace_dl/tensorflow/compressor/topk.py | 49 + .../grace_dl/tensorflow/compressor/u8bit.py | 110 + untitled folder/grace_dl/tensorflow/helper.py | 107 + .../grace_dl/tensorflow/memory/.DS_Store | Bin 0 -> 6148 bytes .../grace_dl/tensorflow/memory/dgc.py | 38 + .../grace_dl/tensorflow/memory/efsignsgd.py | 24 + .../grace_dl/tensorflow/memory/none.py | 11 + .../grace_dl/tensorflow/memory/powersgd.py | 38 + .../grace_dl/tensorflow/memory/residual.py | 28 + untitled folder/grace_dl/torch/.DS_Store | Bin 0 -> 8196 bytes untitled folder/grace_dl/torch/__init__.py | 58 + .../grace_dl/torch/communicator/.DS_Store | Bin 0 -> 6148 bytes .../grace_dl/torch/communicator/allgather.py | 58 + 
.../grace_dl/torch/communicator/allreduce.py | 15 + .../grace_dl/torch/communicator/broadcast.py | 27 + .../grace_dl/torch/compressor/.DS_Store | Bin 0 -> 6148 bytes .../grace_dl/torch/compressor/dgc.py | 50 + .../grace_dl/torch/compressor/efsignsgd.py | 33 + .../grace_dl/torch/compressor/fp16.py | 22 + .../grace_dl/torch/compressor/natural.py | 40 + .../grace_dl/torch/compressor/none.py | 12 + .../grace_dl/torch/compressor/onebit.py | 32 + .../grace_dl/torch/compressor/powersgd.py | 58 + .../grace_dl/torch/compressor/qsgd.py | 39 + .../grace_dl/torch/compressor/randomk.py | 40 + .../grace_dl/torch/compressor/signsgd.py | 30 + .../grace_dl/torch/compressor/signum.py | 37 + .../grace_dl/torch/compressor/terngrad.py | 32 + .../grace_dl/torch/compressor/threshold.py | 27 + .../grace_dl/torch/compressor/topk.py | 37 + untitled folder/grace_dl/torch/helper.py | 96 + .../grace_dl/torch/memory/.DS_Store | Bin 0 -> 6148 bytes untitled folder/grace_dl/torch/memory/dgc.py | 38 + .../grace_dl/torch/memory/efsignsgd.py | 19 + untitled folder/grace_dl/torch/memory/none.py | 11 + .../grace_dl/torch/memory/powersgd.py | 37 + .../grace_dl/torch/memory/residual.py | 23 + untitled folder/patch_files/.DS_Store | Bin 0 -> 6148 bytes untitled folder/patch_files/horovod/.DS_Store | Bin 0 -> 6148 bytes .../patch_files/horovod/torch/.DS_Store | Bin 0 -> 6148 bytes .../patch_files/horovod/torch/__init__.py | 61 + .../patch_files/horovod/torch/mpi_ops.py | 1333 +++++++++++ .../patch_files/horovod/torch/optimizer.py | 618 +++++ 211 files changed, 53068 insertions(+), 802 deletions(-) create mode 100644 patch_files/horovod/_keras/__init__.py create mode 100644 patch_files/horovod/keras/__init__.py create mode 100644 patch_files/horovod/tensorflow/__init__.py create mode 100644 patch_files/horovod/tensorflow/keras/__init__.py create mode 100644 untitled folder/.DS_Store create mode 100644 untitled folder/grace_dl/.DS_Store create mode 100644 untitled folder/grace_dl/__init__.py create mode 
100644 untitled folder/grace_dl/dist/.DS_Store create mode 100644 untitled folder/grace_dl/dist/__init__.py create mode 100644 untitled folder/grace_dl/dist/communicator/all_to_all.py create mode 100644 untitled folder/grace_dl/dist/communicator/allgather.py create mode 100644 untitled folder/grace_dl/dist/communicator/allreduce.py create mode 100644 untitled folder/grace_dl/dist/communicator/broadcast.py create mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp create mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu create mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py create mode 100644 untitled folder/grace_dl/dist/compressor/dgc.py create mode 100644 untitled folder/grace_dl/dist/compressor/efsignsgd.py create mode 100644 untitled folder/grace_dl/dist/compressor/fp16.py create mode 100644 untitled folder/grace_dl/dist/compressor/natural.py create mode 100644 untitled folder/grace_dl/dist/compressor/none.py create mode 100644 untitled folder/grace_dl/dist/compressor/onebit.py create mode 100644 untitled folder/grace_dl/dist/compressor/powersgd.py create mode 100644 untitled folder/grace_dl/dist/compressor/qsgd.py create mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py create mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp create mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu create mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh create mode 100644 untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh create mode 100644 
untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh create 
mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh create mode 
100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh 
create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu create mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py create mode 100644 untitled folder/grace_dl/dist/compressor/randomk.py create mode 100644 untitled folder/grace_dl/dist/compressor/signsgd.py create mode 100644 untitled folder/grace_dl/dist/compressor/signum.py create mode 100644 untitled folder/grace_dl/dist/compressor/terngrad.py create mode 100644 untitled folder/grace_dl/dist/compressor/threshold.py create mode 100644 untitled folder/grace_dl/dist/compressor/topk.py create mode 100644 untitled folder/grace_dl/dist/helper.py create mode 100644 untitled folder/grace_dl/dist/memory/dgc.py create mode 100644 untitled folder/grace_dl/dist/memory/efsignsgd.py create mode 100644 untitled folder/grace_dl/dist/memory/none.py create mode 100644 untitled folder/grace_dl/dist/memory/powersgd.py create mode 100644 untitled 
folder/grace_dl/dist/memory/residual.py create mode 100644 untitled folder/grace_dl/tensorflow/.DS_Store create mode 100644 untitled folder/grace_dl/tensorflow/__init__.py create mode 100644 untitled folder/grace_dl/tensorflow/communicator/.DS_Store create mode 100644 untitled folder/grace_dl/tensorflow/communicator/allgather.py create mode 100644 untitled folder/grace_dl/tensorflow/communicator/allreduce.py create mode 100644 untitled folder/grace_dl/tensorflow/communicator/broadcast.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/.DS_Store create mode 100644 untitled folder/grace_dl/tensorflow/compressor/adaq.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/dgc.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/fp16.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/inceptionn.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/natural.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/none.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/onebit.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/packing.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/powersgd.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/qsgd.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/randomk.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/signsgd.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/signum.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/sketch.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/terngrad.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/threshold.py create mode 100644 untitled folder/grace_dl/tensorflow/compressor/topk.py create mode 100644 untitled 
folder/grace_dl/tensorflow/compressor/u8bit.py create mode 100644 untitled folder/grace_dl/tensorflow/helper.py create mode 100644 untitled folder/grace_dl/tensorflow/memory/.DS_Store create mode 100644 untitled folder/grace_dl/tensorflow/memory/dgc.py create mode 100644 untitled folder/grace_dl/tensorflow/memory/efsignsgd.py create mode 100644 untitled folder/grace_dl/tensorflow/memory/none.py create mode 100644 untitled folder/grace_dl/tensorflow/memory/powersgd.py create mode 100644 untitled folder/grace_dl/tensorflow/memory/residual.py create mode 100644 untitled folder/grace_dl/torch/.DS_Store create mode 100644 untitled folder/grace_dl/torch/__init__.py create mode 100644 untitled folder/grace_dl/torch/communicator/.DS_Store create mode 100644 untitled folder/grace_dl/torch/communicator/allgather.py create mode 100644 untitled folder/grace_dl/torch/communicator/allreduce.py create mode 100644 untitled folder/grace_dl/torch/communicator/broadcast.py create mode 100644 untitled folder/grace_dl/torch/compressor/.DS_Store create mode 100644 untitled folder/grace_dl/torch/compressor/dgc.py create mode 100644 untitled folder/grace_dl/torch/compressor/efsignsgd.py create mode 100644 untitled folder/grace_dl/torch/compressor/fp16.py create mode 100644 untitled folder/grace_dl/torch/compressor/natural.py create mode 100644 untitled folder/grace_dl/torch/compressor/none.py create mode 100644 untitled folder/grace_dl/torch/compressor/onebit.py create mode 100644 untitled folder/grace_dl/torch/compressor/powersgd.py create mode 100644 untitled folder/grace_dl/torch/compressor/qsgd.py create mode 100644 untitled folder/grace_dl/torch/compressor/randomk.py create mode 100644 untitled folder/grace_dl/torch/compressor/signsgd.py create mode 100644 untitled folder/grace_dl/torch/compressor/signum.py create mode 100644 untitled folder/grace_dl/torch/compressor/terngrad.py create mode 100644 untitled folder/grace_dl/torch/compressor/threshold.py create mode 100644 untitled 
folder/grace_dl/torch/compressor/topk.py create mode 100644 untitled folder/grace_dl/torch/helper.py create mode 100644 untitled folder/grace_dl/torch/memory/.DS_Store create mode 100644 untitled folder/grace_dl/torch/memory/dgc.py create mode 100644 untitled folder/grace_dl/torch/memory/efsignsgd.py create mode 100644 untitled folder/grace_dl/torch/memory/none.py create mode 100644 untitled folder/grace_dl/torch/memory/powersgd.py create mode 100644 untitled folder/grace_dl/torch/memory/residual.py create mode 100644 untitled folder/patch_files/.DS_Store create mode 100644 untitled folder/patch_files/horovod/.DS_Store create mode 100644 untitled folder/patch_files/horovod/torch/.DS_Store create mode 100644 untitled folder/patch_files/horovod/torch/__init__.py create mode 100644 untitled folder/patch_files/horovod/torch/mpi_ops.py create mode 100644 untitled folder/patch_files/horovod/torch/optimizer.py diff --git a/.DS_Store b/.DS_Store index bad07e965141e67ad73cab84e3ab9c0ed6fb1380..a5368b17789b4e20b560a7aeab0f7e415f4f348a 100644 GIT binary patch delta 704 zcmZp1XmOa}&nUAoU^hRb%w`?|b5?nNhEj$+h7yKMAkJY(Wk_LAU`S)g2eMOuyrRhp z0u7V>ggojAs7onM&PmG8&tU)p0!nkz4TF>Oa|=L9p@4vn+foy`>QpnI%kb%v48u$XKEVw8yCqFM8=p@F8h1)l?OW2GCEJjkIIAFH`06_qk A8UO$Q delta 32 mcmZp1XmOa}&nUVvU^hRb=w==PbJmR|N0>IVOV|Jz2bln?Dhf~l diff --git a/grace_dl/tensorflow/communicator/allgather.py b/grace_dl/tensorflow/communicator/allgather.py index 4bdba0e..f3904e3 100644 --- a/grace_dl/tensorflow/communicator/allgather.py +++ b/grace_dl/tensorflow/communicator/allgather.py @@ -1,6 +1,6 @@ import tensorflow as tf -from grace.grace_dl.tensorflow import Communicator +from grace_dl.tensorflow import Communicator from horovod.tensorflow import allgather diff --git a/grace_dl/tensorflow/compressor/topk.py b/grace_dl/tensorflow/compressor/topk.py index 773cd77..9de37a7 100644 --- a/grace_dl/tensorflow/compressor/topk.py +++ b/grace_dl/tensorflow/compressor/topk.py @@ -1,6 +1,6 @@ import tensorflow as tf -from 
grace.grace_dl.tensorflow import Compressor +from grace_dl.tensorflow import Compressor def sparsify(tensor, compress_ratio): diff --git a/grace_dl/tensorflow/memory/residual.py b/grace_dl/tensorflow/memory/residual.py index 00347ce..ccd4dc9 100644 --- a/grace_dl/tensorflow/memory/residual.py +++ b/grace_dl/tensorflow/memory/residual.py @@ -1,6 +1,6 @@ import tensorflow as tf -from grace.grace_dl.tensorflow import Memory +from grace_dl.tensorflow import Memory class ResidualMemory(Memory): @@ -8,8 +8,8 @@ def __init__(self, beta=1.0, gamma=1.0): self.residuals = {} self.beta = beta self.gamma = gamma - for v in tf.compat.v1.trainable_variables(): - self.residuals[v.name] = tf.compat.v1.Variable(tf.zeros_like(v), trainable=False) + for v in tf.trainable_variables(): + self.residuals[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) def compensate(self, tensor, name): """Update the tensor with the residuals.""" diff --git a/grace_dl/torch/__init__.py b/grace_dl/torch/__init__.py index 4030ee4..12900df 100644 --- a/grace_dl/torch/__init__.py +++ b/grace_dl/torch/__init__.py @@ -36,23 +36,23 @@ def aggregate(self, tensors): class Communicator(ABC): @abstractmethod - def async_send(self, tensors, name,process_set): + def async_send(self, tensors, name): raise NotImplemented("async_send was not implemented.") @abstractmethod - def wait_receive(self, handles, ctx,process_set): + def wait_receive(self, handles, ctx): raise NotImplemented("wait_receive was not implemented.") def __init__(self, compressor, memory): self.compressor = compressor self.memory = memory - def send_step(self, tensor, name,process_set): + def send_step(self, tensor, name): tensor = self.memory.compensate(tensor, name) tensors_compressed, ctx = self.compressor.compress(tensor, name) self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - handles = self.async_send(tensors_compressed, name,process_set) + handles = self.async_send(tensors_compressed, name) return handles, ctx 
- def receive_step(self, handles, ctx,process_set): - return self.wait_receive(handles, ctx,process_set) + def receive_step(self, handles, ctx): + return self.wait_receive(handles, ctx) diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py index ff95a6c..a598475 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -1,6 +1,6 @@ import torch -from grace.grace_dl.torch import Communicator +from grace_dl.torch import Communicator from horovod.torch import allgather, allgather_async, synchronize @@ -9,8 +9,7 @@ def __init__(self, compressor, memory, world_size): super().__init__(compressor, memory) self.world_size = world_size - - def async_send(self, tensors_compressed, name,process_set): + def async_send(self, tensors_compressed, name): """ :param tensors_compressed: list of flat tensors to communicate :param name: for the all_gather operation @@ -27,25 +26,21 @@ def async_send(self, tensors_compressed, name,process_set): tensor_sizes = zip(*tensors_size_ag) # transpose else: tensors_size = torch.tensor(tensors_size) # TODO: set device - gathered = allgather(tensor=tensors_size,process_set=process_set) # tensor of tensor sizes per rank + gathered = allgather(tensors_size) # tensor of tensor sizes per rank tensor_sizes = gathered.view([self.world_size, -1]).t().tolist() # transpose, to list handles = [] for tensor_compressed in tensors_compressed: - handle = allgather_async(tensor=tensor_compressed,process_set=process_set) + handle = allgather_async(tensor_compressed) handles.append(handle) return handles, tensor_sizes - def wait_receive(self, result, ctx,process_set): + def wait_receive(self, result, ctx): handles, tensor_sizes = result - # print("Result",result) tensors_ag = [] for handle, sizes in zip(handles, tensor_sizes): gathered = synchronize(handle) - # print("gathered",gathered) - # print("gathered",len(gathered)) - # print("sizes",sizes) 
tensors_ag.append(gathered.split(sizes)) # import time # time.sleep(0.001) diff --git a/grace_dl/torch/communicator/allreduce.py b/grace_dl/torch/communicator/allreduce.py index 98e5396..89255e6 100644 --- a/grace_dl/torch/communicator/allreduce.py +++ b/grace_dl/torch/communicator/allreduce.py @@ -1,4 +1,4 @@ -from grace.grace_dl.torch import Communicator +from grace_dl.torch import Communicator from horovod.torch import allreduce_async_, synchronize diff --git a/grace_dl/torch/compressor/fp16.py b/grace_dl/torch/compressor/fp16.py index 5021ae6..db6fe4a 100644 --- a/grace_dl/torch/compressor/fp16.py +++ b/grace_dl/torch/compressor/fp16.py @@ -6,17 +6,17 @@ class FP16Compressor(Compressor): """Compress all floating point gradients to 16-bit.""" - def compress(self, tensor): + def compress(self, tensor, name): """Downcasts the tensor to 16-bit.""" dtype = tensor.dtype if dtype.is_floating_point: # Only allow compression from other floating point types tensor = tensor.type(torch.float16) - return tensor, dtype + return [tensor], dtype def decompress(self, tensors, dtype): """Upcasts the tensor to the initialization dtype.""" - tensor_decompressed = tensors + tensor_decompressed, = tensors if dtype.is_floating_point: tensor_decompressed = tensor_decompressed.type(dtype) return tensor_decompressed diff --git a/grace_dl/torch/compressor/randomk.py b/grace_dl/torch/compressor/randomk.py index c345956..3be19de 100644 --- a/grace_dl/torch/compressor/randomk.py +++ b/grace_dl/torch/compressor/randomk.py @@ -1,6 +1,6 @@ import torch -from grace.grace_dl.torch import Compressor +from grace_dl.torch import Compressor def sparsify(tensor, compress_ratio): diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 9ccfc2d..6d2a750 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -1,6 +1,6 @@ import torch -from grace.grace_dl.torch import Compressor +from grace_dl.torch import Compressor def sparsify(tensor, 
compress_ratio): @@ -13,7 +13,6 @@ def sparsify(tensor, compress_ratio): def desparsify(tensors, numel): values, indices = tensors - # print("values, indices",values, indices,numel) tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) tensor_decompressed.scatter_(0, indices, values) return tensor_decompressed @@ -22,7 +21,7 @@ def desparsify(tensors, numel): class TopKCompressor(Compressor): def __init__(self, compress_ratio): - super().__init__()#tensors_size_are_same=False + super().__init__() self.compress_ratio = compress_ratio def compress(self, tensor, name): diff --git a/grace_dl/torch/helper.py b/grace_dl/torch/helper.py index 0796893..a59a6b9 100644 --- a/grace_dl/torch/helper.py +++ b/grace_dl/torch/helper.py @@ -1,10 +1,6 @@ -def grace_from_params(params,processset): - import sys - sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") - sys.path.append("/content/grace/") +def grace_from_params(params): import horovod.torch as hvd - world_size = processset.size() - # print("world_size",world_size,processset.size()) + world_size = hvd.size() comp = params.get('compressor', 'none') mem = params.get('memory', 'none') comm = params.get('communicator', 'allreduce') @@ -87,7 +83,6 @@ def grace_from_params(params,processset): return Allreduce(compressor, memory) elif comm == 'allgather': from grace_dl.torch.communicator.allgather import Allgather - # print("compressor",compressor) return Allgather(compressor, memory, world_size) elif comm == 'broadcast': from grace_dl.torch.communicator.broadcast import Broadcast diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index 6054d25..a2ae61f 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -1,4 +1,4 @@ -from grace.grace_dl.torch import Memory +from grace_dl.torch import Memory class ResidualMemory(Memory): @@ -9,15 +9,12 @@ def __init__(self, beta=1.0, gamma=1.0): def 
compensate(self, tensor, name): """Update the tensor with the residuals.""" - # print("inside residuals compensate") if name in self.residuals: tensor = self.beta * self.residuals[name] + self.gamma * tensor - # print("inside residuals",self.residuals[name] ) return tensor def update(self, tensor, name, compressor, tensor_compressed, ctx): """Update the residuals.""" tensor_decompressed = compressor.decompress(tensor_compressed, ctx) residual = tensor - tensor_decompressed - # print("residual update",residual) self.residuals[name] = residual diff --git a/patch_files/horovod/_keras/__init__.py b/patch_files/horovod/_keras/__init__.py new file mode 100644 index 0000000..a24222e --- /dev/null +++ b/patch_files/horovod/_keras/__init__.py @@ -0,0 +1,183 @@ +# Copyright 2017 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from distutils.version import LooseVersion + +import horovod.tensorflow as hvd +import tensorflow as tf +from horovod.tensorflow.gradient_aggregation import LocalGradientAggregationHelper +from horovod.tensorflow.gradient_aggregation_eager import LocalGradientAggregationHelperEager +from horovod.tensorflow.mpi_ops import rank + + +_PRE_TF_2_4_0 = LooseVersion(tf.__version__) < LooseVersion('2.4.0') + + +def create_distributed_optimizer(keras, optimizer, name, device_dense, device_sparse, + grace, compression, sparse_as_dense, gradient_predivide_factor, + op, backward_passes_per_step=1, + average_aggregated_gradients=False, + num_groups=0): + class _DistributedOptimizer(keras.optimizers.Optimizer): + _HAS_AGGREGATE_GRAD = True + + def __init__(self, **kwargs): + self._name = name or "Distributed%s" % self.__class__.__base__.__name__ + self._aggregated_gradients = False + + self._allreduce_grads = hvd._make_allreduce_grads_fn( + self._name, + device_dense, + device_sparse, + grace, + compression, + sparse_as_dense, + op, + gradient_predivide_factor, + num_groups) + + self._agg_helper = None + if backward_passes_per_step > 1: + if hvd._executing_eagerly(): + self._agg_helper = LocalGradientAggregationHelperEager( + backward_passes_per_step=backward_passes_per_step, + allreduce_func=self._allreduce_grads, + sparse_as_dense=sparse_as_dense, + average_aggregated_gradients=average_aggregated_gradients, + ) + else: + self._agg_helper = LocalGradientAggregationHelper( + backward_passes_per_step=backward_passes_per_step, + allreduce_func=self._allreduce_grads, + sparse_as_dense=sparse_as_dense, + average_aggregated_gradients=average_aggregated_gradients, + rank=rank(), + optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS, + ) + + super(self.__class__, self).__init__(**kwargs) + + def get_gradients(self, loss, params): + """ + Compute gradients of all trainable variables. 
+ + See Optimizer.get_gradients() for more info. + + In DistributedOptimizer, get_gradients() is overriden to also + allreduce the gradients before returning them. + """ + grads = super(self.__class__, self).get_gradients(loss, params) + gradients = zip(grads, params) + return self._allreduce(gradients) + + def _aggregate_gradients(self, grads_and_vars): + grads, vars = list(zip(*grads_and_vars)) + aggregated_grads = self._allreduce(grads) + if _PRE_TF_2_4_0: + # Prior to TF 2.4.0, this function was expected to return only a list of + # grads, not a list of (grad, var) tuples. + return aggregated_grads + return list(zip(aggregated_grads, vars)) + + def _allreduce(self, grads): + self._aggregated_gradients = True + + if self._agg_helper: + return self._agg_helper.compute_gradients(tuple(grads)) + else: + return self._allreduce_grads(grads) + + def apply_gradients(self, *args, **kwargs): + if self._agg_helper: + if isinstance(args[0], zip): + # If grad_and_vars are passed in as a zip object + # convert to a list. This is necessary for TF2.4+ + # b/c args[0] is used in both conditional branches + # inside _agg_helper.apply_gradients(). + args = list(args) + args[0] = list(args[0]) + args = tuple(args) + + results = self._agg_helper.apply_gradients( + lambda: super(self.__class__, self).apply_gradients(*args, **kwargs), + self, + *args, + **kwargs, + ) + else: + results = super(self.__class__, self).apply_gradients(*args, **kwargs) + + if not self._aggregated_gradients: + raise Exception('`apply_gradients()` was called without a call to ' + '`get_gradients()` or `_aggregate_gradients`. If you\'re ' + 'using TensorFlow 2.0, please specify ' + '`experimental_run_tf_function=False` in `compile()`.') + + return results + + # We dynamically create a new class that inherits from the optimizer that was passed in. + # The goal is to override get_gradients() method with an allreduce implementation. 
+ # This class will have the same name as the optimizer it's wrapping, so that the saved + # model could be easily restored without Horovod. + cls = type(optimizer.__class__.__name__, (optimizer.__class__,), + dict(_DistributedOptimizer.__dict__)) + + return cls.from_config(optimizer.get_config()) + + +def _eval(backend, op_or_result): + if hvd._executing_eagerly(): + return op_or_result + else: + return backend.get_session().run(op_or_result) + + +if hasattr(hvd, 'broadcast_global_variables'): + def broadcast_global_variables(backend, root_rank): + return _eval(backend, hvd.broadcast_global_variables(root_rank)) + + +def allreduce(backend, value, name, average, prescale_factor, postscale_factor, op, grace, compression): + return _eval(backend, hvd.allreduce(tf.constant(value, name=name), average=average, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + op=op, grace=grace, compression=compression)) + + +def allgather(backend, value, name): + return _eval(backend, hvd.allgather(tf.constant(value, name=name))) + + +def broadcast(backend, value, root_rank, name): + return _eval(backend, hvd.broadcast(tf.constant(value, name=name), root_rank)) + + +def load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects): + horovod_objects = { + subclass.__name__.lower(): wrap_optimizer(subclass) + for subclass in keras.optimizers.Optimizer.__subclasses__() + if subclass.__module__ in optimizer_modules + } + + if custom_optimizers is not None: + horovod_objects.update({ + cls.__name__: wrap_optimizer(cls) + for cls in custom_optimizers + }) + + if custom_objects is not None: + horovod_objects.update(custom_objects) + + return keras.models.load_model(filepath, custom_objects=horovod_objects) diff --git a/patch_files/horovod/keras/__init__.py b/patch_files/horovod/keras/__init__.py new file mode 100644 index 0000000..9cbcafe --- /dev/null +++ b/patch_files/horovod/keras/__init__.py @@ -0,0 +1,183 @@ +# Copyright 2017 
Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import keras +import keras.backend as K + +from horovod.tensorflow import init +from horovod.tensorflow import shutdown +from horovod.tensorflow import size +from horovod.tensorflow import is_initialized, start_timeline, stop_timeline +from horovod.tensorflow import local_size +from horovod.tensorflow import rank +from horovod.tensorflow import local_rank +from horovod.tensorflow import mpi_threads_supported, mpi_enabled, mpi_built +from horovod.tensorflow import gloo_enabled, gloo_built +from horovod.tensorflow import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built +from horovod.tensorflow import Average, Compression, Sum + + +from horovod.keras import callbacks, elastic +import horovod._keras as _impl + + +def DistributedOptimizer(optimizer, name=None, + device_dense='', device_sparse='', + grace=None, + compression=Compression.none, + sparse_as_dense=False, + gradient_predivide_factor=1.0, + op=Average, + num_groups=0): + """ + An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to + average gradient values before applying gradients to model weights. + + Args: + optimizer: Optimizer to use for computing gradients and applying updates. + name: Optional name prefix for the operations created when applying + gradients. 
Defaults to "Distributed" followed by the provided + optimizer type. + device_dense: Device to be used for dense tensors. Uses GPU by default + if Horovod was build with HOROVOD_GPU_OPERATIONS. + device_sparse: Device to be used for sparse tensors. Uses GPU by default + if Horovod was build with HOROVOD_GPU_OPERATIONS. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. + sparse_as_dense: Treat all sparse gradients as dense tensors. This can + help improve performance and memory utilization if + the original sparse gradient has high density. + Defaults to false. + gradient_predivide_factor: gradient_predivide_factor splits the averaging + before and after the sum. Gradients are scaled by + 1.0 / gradient_predivide_factor before the sum and + gradient_predivide_factor / size after the sum. + op: The reduction operation to use when combining gradients across + different ranks. Defaults to Average. + num_groups: Number of groups to assign gradient allreduce ops to for explicit + grouping. Defaults to no explicit groups. + """ + if gradient_predivide_factor != 1.0 and rocm_built(): + raise ValueError('gradient_predivide_factor not supported yet with ROCm') + + if op != Average and op != Sum: + raise ValueError('op currently only supports Average and Sum') + + return _impl.create_distributed_optimizer( + keras=keras, + optimizer=optimizer, + name=name, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + sparse_as_dense=sparse_as_dense, + gradient_predivide_factor=gradient_predivide_factor, + op=op, + num_groups=num_groups, + ) + + +def broadcast_global_variables(root_rank): + """Broadcasts all global variables from root rank to all other processes. + + Arguments: + root_rank: Rank of the process from which global variables will be broadcasted + to all other processes. 
+ """ + return _impl.broadcast_global_variables(K, root_rank) + + +def allreduce(value, name=None, average=True, prescale_factor=1.0, postscale_factor=1.0): + """ + Perform an allreduce on a tensor-compatible value. + + Arguments: + value: A tensor-compatible value to reduce. + The shape of the input must be identical across all ranks. + name: Optional name for the constants created by this operation. + average: If True, computes the average over all ranks. + Otherwise, computes the sum over all ranks. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + """ + return _impl.allreduce(K, value, name, average, prescale_factor, postscale_factor) + + +def allgather(value, name=None): + """ + Perform an allgather on a tensor-compatible value. + + The concatenation is done on the first dimension, so the input values on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + value: A tensor-compatible value to gather. + name: Optional name prefix for the constants created by this operation. + """ + return _impl.allgather(K, value, name) + + +def broadcast(value, root_rank, name=None): + """ + Perform a broadcast on a tensor-compatible value. + + Arguments: + value: A tensor-compatible value to reduce. + The shape of the input must be identical across all ranks. + root_rank: Rank of the process from which global variables will be + broadcasted to all other processes. + name: Optional name for the constants created by this operation. + """ + return _impl.broadcast(K, value, root_rank, name) + + +def load_model(filepath, custom_optimizers=None, custom_objects=None, grace=None, compression=Compression.none): + """ + Loads a saved Keras model with a Horovod DistributedOptimizer. 
+ + The DistributedOptimizer will wrap the underlying optimizer used to train + the saved model, so that the optimizer state (params and weights) will + be picked up for retraining. + + By default, all optimizers in the module `keras.optimizers` will be loaded + and wrapped without needing to specify any `custom_optimizers` or + `custom_objects`. + + Arguments: + filepath: One of the following: + - string, path to the saved model, or + - h5py.File object from which to load the model + custom_optimizers: Optional list of Optimizer subclasses to support + during loading. + custom_objects: Optional dictionary mapping names (strings) to custom + classes or functions to be considered during deserialization. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. + + Returns: + A Keras model instance. + + Raises: + ImportError: If h5py is not available. + ValueError: In case of an invalid savefile. + """ + def wrap_optimizer(cls): + return lambda **kwargs: DistributedOptimizer(cls(**kwargs), grace=grace, compression=compression) + optimizer_modules = {keras.optimizers.Optimizer.__module__} + return _impl.load_model(keras, wrap_optimizer, optimizer_modules, filepath, custom_optimizers, custom_objects) diff --git a/patch_files/horovod/tensorflow/__init__.py b/patch_files/horovod/tensorflow/__init__.py new file mode 100644 index 0000000..b616e28 --- /dev/null +++ b/patch_files/horovod/tensorflow/__init__.py @@ -0,0 +1,761 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Modifications copyright (C) 2019 Uber Technologies, Inc. +# Modifications copyright Microsoft +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# pylint: disable=g-short-docstring-punctuation + +import os +import warnings + +from horovod.common.util import check_extension, gpu_available, split_list + +check_extension('horovod.tensorflow', 'HOROVOD_WITH_TENSORFLOW', __file__, 'mpi_lib') + +from horovod.tensorflow import elastic +from horovod.tensorflow.compression import Compression +from horovod.tensorflow.functions import allgather_object, broadcast_object, broadcast_object_fn, broadcast_variables +from horovod.tensorflow.mpi_ops import allgather, broadcast, _allreduce, _grouped_allreduce, alltoall +from horovod.tensorflow.mpi_ops import init, shutdown +from horovod.tensorflow.mpi_ops import is_initialized, start_timeline, stop_timeline +from horovod.tensorflow.mpi_ops import size, local_size, rank, local_rank, is_homogeneous +from horovod.tensorflow.mpi_ops import rank_op, local_rank_op, size_op, local_size_op +from horovod.tensorflow.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built +from horovod.tensorflow.mpi_ops import gloo_enabled, gloo_built +from horovod.tensorflow.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built +from horovod.tensorflow.mpi_ops import Average, Sum, Adasum +from horovod.tensorflow.mpi_ops import handle_average_backwards_compatibility, check_num_rank_power_of_2 +from horovod.tensorflow.util import _executing_eagerly, _make_subgraph, _cache +from horovod.tensorflow.mpi_ops import join +from horovod.tensorflow.sync_batch_norm import SyncBatchNormalization +from 
horovod.tensorflow.gradient_aggregation import LocalGradientAggregationHelper + +import tensorflow as tf + +# @DEKHTIARJonathan: Do not remove, this fixes issues: +# - https://github.com/tensorflow/tensorflow/issues/38516 +# - https://github.com/tensorflow/tensorflow/issues/39894 +if tf.__version__.startswith('2.2.'): + from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check + device_compatibility_check.log_device_compatibility_check = lambda policy_name, skip_local: None + + +def allreduce(tensor, average=None, device_dense='', device_sparse='', + grace=None, + compression=Compression.none, op=None, + prescale_factor=1.0, postscale_factor=1.0, + name=None): + """Perform an allreduce on a tf.Tensor or tf.IndexedSlices. + + This function performs a bandwidth-optimal ring allreduce on the input + tensor. If the input is an tf.IndexedSlices, the function instead does an + allgather on the values and the indices, effectively doing an allreduce on + the represented tensor. + + Arguments: + tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce. + The shape of the input must be identical across all ranks. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v0.21.0. + + device_dense: Device to be used for dense tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + device_sparse: Device to be used for sparse tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. 
+ name: A name of the allreduce operation + + Returns: + A tensor of the same shape and type as `tensor`, summed across all + processes. + """ + op = handle_average_backwards_compatibility(op, average) + + if rank() == 0: + print(tensor) + if isinstance(tensor, tf.IndexedSlices): + # TODO: Need to fix this to actuall call Adasum + if op == Adasum: + raise NotImplementedError('The Adasum reduction does not currently support sparse tensors. As a ' + 'workaround please pass sparse_as_dense=True to DistributedOptimizer') + with tf.device(device_sparse): + # print("==Debug== this model has a sparse gradient") + # For IndexedSlices, do two allgathers instead of an allreduce. + horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), + dtype=tensor.values.dtype) + values = allgather(tensor.values) + indices = allgather(tensor.indices) + + # To make this operation into an average, divide allgathered values by + # the Horovod size. + new_values = (values / horovod_size) if op == Average else values + return tf.IndexedSlices(new_values, indices, + dense_shape=tensor.dense_shape) + else: + average_in_framework = False + if rocm_built(): + # For ROCm, perform averaging at framework level + average_in_framework = op == Average or op == Adasum + op = Sum if op == Average else op + + with tf.device(device_dense): + horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), + dtype=tensor.dtype) + + if grace and op != Adasum: + # print("==Debug== grace is called") + new_tensor = grace.step(tensor, name=name) + else: + name = None + tensor_compressed, ctx = compression.compress(tensor) + summed_tensor_compressed = _allreduce(tensor_compressed, op=op, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + name=name) + summed_tensor = compression.decompress(summed_tensor_compressed, ctx) + if op == Adasum: + if 'CPU' not in tensor.device and gpu_available('tensorflow'): + if nccl_built(): + if not 
is_homogeneous: + raise NotImplementedError( + 'Running GPU Adasum on heterogeneous cluster is not supported yet.') + elif not check_num_rank_power_of_2(int(size() / local_size())): + raise NotImplementedError( + 'Running GPU Adasum with non-power of 2 nodes is not supported yet.') + if rocm_built(): + horovod_local_size = tf.cast(local_size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else local_size(), + dtype=tensor.dtype) + new_tensor = summed_tensor / horovod_local_size + else: + new_tensor = summed_tensor + else: + warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors ' + 'are copied to CPU memory instead. To use Adasum for GPU reduction, please ' + 'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.') + new_tensor = summed_tensor + else: + if not check_num_rank_power_of_2(size()): + raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') + new_tensor = summed_tensor + else: + if rocm_built(): + new_tensor = (summed_tensor / horovod_size) if average_in_framework else summed_tensor + else: + new_tensor = summed_tensor + return new_tensor + +def grouped_allreduce(tensors, average=None, device_dense='', device_sparse='', + grace=None, compression=Compression.none, op=None, + prescale_factor=1.0, postscale_factor=1.0): + if not tensors: + return tensors + + op = handle_average_backwards_compatibility(op, average) + + average_in_framework = False + if rocm_built(): + # For ROCm, perform averaging at framework level + average_in_framework = op == Average or op == Adasum + op = Sum if op == Average else op + + if any(isinstance(t, tf.IndexedSlices) for t in tensors): + # TODO: Need to fix this to actuall call Adasum + if op == Adasum: + raise NotImplementedError('The Adasum reduction does not currently support sparse tensors. 
As a ' + 'workaround please pass sparse_as_dense=True to DistributedOptimizer') + with tf.device(device_sparse): + new_values = [] + for tensor in tensors: + # For IndexedSlices, do two allgathers instead of an allreduce. + horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), + dtype=tensor.values.dtype) + values = allgather(tensor.values) + indices = allgather(tensor.indices) + + # To make this operation into an average, divide allgathered values by + # the Horovod size. + new_values += (values / horovod_size) if op == Average else values + return [tf.IndexedSlices(x, indices, + dense_shape=t.dense_shape) for x,t in zip(new_values, tensors)] + else: + with tf.device(device_dense): + tensors_compressed, ctxs = zip(*[compression.compress(tensor) for tensor in tensors]) + summed_tensors_compressed = _grouped_allreduce(tensors_compressed, op=op, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor) + summed_tensors = [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] + if op == Adasum: + tensor = summed_tensors[0] + if 'CPU' not in tensor.device and gpu_available('tensorflow'): + if nccl_built(): + if not is_homogeneous: + raise NotImplementedError( + 'Running GPU Adasum on heterogeneous cluster is not supported yet.') + elif not check_num_rank_power_of_2(int(size() / local_size())): + raise NotImplementedError( + 'Running GPU Adasum with non-power of 2 nodes is not supported yet.') + if rocm_built(): + new_tensors = [] + for tensor in summed_tensors: + horovod_local_size = tf.cast(local_size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else local_size(), + dtype=tensor.dtype) + new_tensors += tensor / horovod_local_size + else: + new_tensors = summed_tensors + else: + warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors ' + 'are copied to CPU memory instead. 
To use Adasum for GPU reduction, please ' + 'compile Horovod with HOROVOD_GPU_OPERATIONS=NCCL.') + new_tensors = summed_tensors + else: + if not check_num_rank_power_of_2(size()): + raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') + new_tensors = summed_tensors + else: + if rocm_built(): + new_tensors = [] + for tensor in summed_tensors: + horovod_size = tf.cast(size_op() if int(os.environ.get("HOROVOD_ELASTIC", 0)) else size(), + dtype=tensor.dtype) + new_tensors += (tensor / horovod_size) if average_in_framework else tensor + else: + new_tensors = summed_tensors + return new_tensors + +def _allreduce_cond(tensor, *args, **kwargs): + def allreduce_fn(): + return allreduce(tensor, *args, **kwargs) + + def id_fn(): + return tensor + + return tf.cond((size_op() > 1) if int(os.environ.get("HOROVOD_ELASTIC", 0)) else tf.convert_to_tensor(size() > 1), + allreduce_fn, id_fn) + +def _grouped_allreduce_cond(tensors, *args, **kwargs): + def allreduce_fn(): + return grouped_allreduce(tensors, *args, **kwargs) + + def id_fn(): + return tensors + + return tf.cond((size_op() > 1) if int(os.environ.get("HOROVOD_ELASTIC", 0)) else tf.convert_to_tensor(size() > 1), + allreduce_fn, id_fn) + + +try: + _global_variables = tf.compat.v1.global_variables +except AttributeError: + try: + _global_variables = tf.global_variables + except AttributeError: + _global_variables = None + +if _global_variables is not None: + def broadcast_global_variables(root_rank): + """Broadcasts all global variables from root rank to all other processes. + + **NOTE:** deprecated in TensorFlow 2.0. + + Arguments: + root_rank: rank of the process from which global variables will be broadcasted + to all other processes. + """ + if _executing_eagerly(): + raise RuntimeError( + "hvd.broadcast_global_variables() does not support eager execution. " + "Please use `hvd.broadcast_variables()` instead." 
+ ) + + return broadcast_variables(_global_variables(), root_rank) + +try: + _get_default_graph = tf.compat.v1.get_default_graph +except AttributeError: + try: + _get_default_graph = tf.get_default_graph + except AttributeError: + _get_default_graph = None + +try: + _SessionRunHook = tf.estimator.SessionRunHook +except AttributeError: + try: + _SessionRunHook = tf.train.SessionRunHook + except AttributeError: + _SessionRunHook = None + +if _SessionRunHook is not None and _get_default_graph is not None: + class BroadcastGlobalVariablesHook(_SessionRunHook): + """ + SessionRunHook that will broadcast all global variables from root rank + to all other processes during initialization. + + This is necessary to ensure consistent initialization of all workers when + training is started with random weights or restored from a checkpoint. + + **NOTE:** deprecated in TensorFlow 2.0. + """ + + def __init__(self, root_rank, device=''): + """Construct a new BroadcastGlobalVariablesHook that will broadcast all + global variables from root rank to all other processes during initialization. + + Args: + root_rank: + Rank that will send data, other ranks will receive data. + device: + Device to be used for broadcasting. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. 
+ """ + super(BroadcastGlobalVariablesHook, self).__init__() + self.root_rank = root_rank + self.bcast_op = None + self.device = device + + def begin(self): + if not self.bcast_op or self.bcast_op.graph != _get_default_graph(): + with tf.device(self.device): + self.bcast_op = broadcast_global_variables(self.root_rank) + + def after_create_session(self, session, coord): + session.run(self.bcast_op) + + +@_cache +def _make_allreduce_grads_fn(name, device_dense, device_sparse, + grace, compression, sparse_as_dense, op, gradient_predivide_factor, + num_groups): + if op == Average: + # Split average operation across pre/postscale factors + # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. + prescale_factor = 1.0 / gradient_predivide_factor + postscale_factor = gradient_predivide_factor + else: + prescale_factor = 1.0 + postscale_factor = 1.0 + + def allreduce_grads(gradients): + grads, vars = zip(*gradients) + with tf.name_scope(name + "_Allreduce"): + if sparse_as_dense: + grads = [tf.convert_to_tensor(grad) + if grad is not None and isinstance(grad, tf.IndexedSlices) + else grad for grad in grads] + + if num_groups > 0: + grads_clean = [grad for grad in grads if grad is not None] + grads_split = split_list(grads_clean, num_groups) + + reduce_ops = [] + for group in grads_split: + reduce_ops += _grouped_allreduce_cond(group, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + op=op, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor) + return reduce_ops + + return [_allreduce_cond(grad, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + op=op, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + name=var.name) + if grad is not None else grad + for (grad,var) in zip(grads, vars)] + + if _executing_eagerly(): + return _make_subgraph(allreduce_grads) + else: + return allreduce_grads + + 
+try: + # TensorFlow 2.x + _LegacyOptimizer = tf.compat.v1.train.Optimizer +except AttributeError: + try: + # TensorFlow 1.x + _LegacyOptimizer = tf.train.Optimizer + except AttributeError: + # Future TensorFlow versions + _LegacyOptimizer = None + +if _LegacyOptimizer is not None: + class _DistributedOptimizer(_LegacyOptimizer): + """An optimizer that wraps another tf.Optimizer, using an allreduce to + combine gradient values before applying gradients to model weights.""" + + def __init__(self, optimizer, name=None, use_locking=False, device_dense='', + device_sparse='', grace=None, compression=Compression.none, + sparse_as_dense=False, op=Average, gradient_predivide_factor=1.0, + backward_passes_per_step=1, average_aggregated_gradients=False, + num_groups=0): + if name is None: + name = "Distributed{}".format(type(optimizer).__name__) + super(_DistributedOptimizer, self).__init__(name=name, use_locking=use_locking) + + self._optimizer = optimizer + self._allreduce_grads = _make_allreduce_grads_fn( + name, device_dense, device_sparse, grace, compression, sparse_as_dense, op, + gradient_predivide_factor, num_groups) + + self._agg_helper = None + if backward_passes_per_step > 1: + if _executing_eagerly(): + raise ValueError( + "backward_passes_per_step > 1 is not yet supported " + "for _LegacyOptimizer with eager execution." + ) + + self._agg_helper = LocalGradientAggregationHelper( + backward_passes_per_step=backward_passes_per_step, + allreduce_func=self._allreduce_grads, + sparse_as_dense=sparse_as_dense, + average_aggregated_gradients=average_aggregated_gradients, + rank=rank(), + optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY, + ) + + def compute_gradients(self, *args, **kwargs): + """Compute gradients of all trainable variables. + + See Optimizer.compute_gradients() for more info. + + In DistributedOptimizer, compute_gradients() is overriden to also + allreduce the gradients before returning them. 
+ """ + gradients = self._optimizer.compute_gradients(*args, **kwargs) + grads, vars = zip(*gradients) + if self._agg_helper: + avg_grads = self._agg_helper.compute_gradients(grads) + else: + avg_grads = self._allreduce_grads(gradients) + return list(zip(avg_grads, vars)) + + def apply_gradients(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + if self._agg_helper: + return self._agg_helper.apply_gradients( + lambda: self._optimizer.apply_gradients(*args, **kwargs), + self._optimizer, + *args, + **kwargs, + ) + + return self._optimizer.apply_gradients(*args, **kwargs) + + def get_slot(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + return self._optimizer.get_slot(*args, **kwargs) + + def get_slot_names(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + return self._optimizer.get_slot_names(*args, **kwargs) + + def variables(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + return self._optimizer.variables(*args, **kwargs) + + class _DistributedAdasumOptimizer(_LegacyOptimizer): + """An optimizer that wraps another tf.Optimizer, using an allreduce to + combine model deltas after applying gradients to model weights.""" + + def __init__(self, optimizer, name=None, use_locking=False, device_dense='', + device_sparse='', grace=None, compression=Compression.none, backward_passes_per_step=1): + if name is None: + name = "DistributedDelta{}".format(type(optimizer).__name__) + super(_DistributedAdasumOptimizer, self).__init__(name=name, use_locking=use_locking) + + self._optimizer = optimizer + self._name = name + self._device_dense = device_dense + self._device_sparse = device_sparse + self._grace = grace + self._compression = compression + self._backward_passes_per_step = backward_passes_per_step + + def _prepare(self): + self._step_count = tf.get_variable( + name="step_count", shape=[], dtype=tf.int64, trainable=False, + 
initializer=tf.zeros_initializer) + self._is_first_step = tf.cast(tf.math.equal(self._step_count, 0), dtype=tf.bool) + self._is_comm_step = tf.cast(tf.math.equal(self._step_count % self._backward_passes_per_step, self._backward_passes_per_step - 1), dtype=tf.bool) + + def _apply_shared(self, var, get_update_op): + start_slot = self._get_or_make_slot(var, "delta_start") + + # initialize start on the first step + assign_op = tf.cond(self._is_first_step, + lambda: start_slot.assign(var, use_locking=self.use_locking).op, + tf.no_op) + + with tf.control_dependencies([assign_op]): + update_op = get_update_op() + with tf.control_dependencies([update_op]): + def update(): + # delta = var - start + local_delta = var.assign_sub(start_slot, use_locking=self.use_locking) # reuse var's memory + # delta = allreduce (delta) + global_delta = allreduce(local_delta, + device_dense=self._device_dense, + device_sparse=self._device_sparse, + grace=self._grace, + compression=self._compression, + op=Adasum) + # start = start + delta + new_start = start_slot.assign_add(global_delta, use_locking=self.use_locking) + # var = start + return var.assign(new_start, use_locking=self.use_locking).op + + # if its a communication step, then apply logic above + # if its not a communication step then just have the underlying + # optimizer update the model parameters according to its logic + return tf.cond(self._is_comm_step, update, tf.no_op) + + def _apply_dense(self, grad, var): + return self._apply_shared(var, lambda: self._optimizer._apply_dense(grad, var)) + + def _resource_apply_dense(self, grad, handle): + return self._apply_shared(handle, lambda: self._optimizer._resource_apply_dense(grad, handle)) + + def _apply_sparse(self, grad, var): + return self._apply_shared(var, lambda: self._optimizer._apply_sparse(grad, var)) + + def _resource_apply_sparse(self, grad, handle, indices): + return self._apply_shared(handle, lambda: self._optimizer._resource_apply_sparse(grad, handle, indices)) + + def 
_finish(self, update_ops, name_scope): + with tf.control_dependencies(update_ops): + return tf.assign_add(self._step_count, 1) + + def compute_gradients(self, *args, **kwargs): + """Compute gradients of all trainable variables. + See Optimizer.compute_gradients() for more info. + """ + return self._optimizer.compute_gradients(*args, **kwargs) + + def apply_gradients(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + return self._optimizer.apply_gradients(*args, **kwargs) + + def get_slot(self, var, name): + """Calls this same method on the underlying optimizer.""" + tmp = super(_DistributedAdasumOptimizer, self).get_slot(var, name) + if tmp is not None: + return tmp + return self._optimizer.get_slot(var, name) + + def get_slot_names(self): + """Appends local slot names to those of the underlying optimizer.""" + return super(_DistributedAdasumOptimizer, self).get_slot_names() +\ + self._optimizer.get_slot_names() + + def variables(self, *args, **kwargs): + """Calls this same method on the underlying optimizer.""" + return self._optimizer.variables(*args, **kwargs) + + +def DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='', + device_sparse='', grace=None, compression=Compression.none, + sparse_as_dense=False, backward_passes_per_step=1, + op=Average, gradient_predivide_factor=1.0, + average_aggregated_gradients=False, + num_groups=0): + """Construct a new DistributedOptimizer, which uses another optimizer + under the hood for computing single-process gradient values and + applying gradient updates after the gradient values have been combined + across all the Horovod ranks. + + Args: + optimizer: + Optimizer to use for computing gradients and applying updates. + name: + Optional name prefix for the operations created when applying + gradients. Defaults to "Distributed" followed by the provided + optimizer type. + use_locking: + Whether to use locking when updating variables. 
+ See Optimizer.__init__ for more info. + device_dense: + Device to be used for dense tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + device_sparse: + Device to be used for sparse tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + compression: + Compression algorithm used during allreduce to reduce the amount + of data sent during each parameter update step. Defaults to + not using compression. + sparse_as_dense: + Treat all sparse gradients as dense tensors. This can help improve + performance and memory utilization if the original sparse gradient + has high density. Defaults to false. + backward_passes_per_step: + Number of backward passes to perform before calling hvd.allreduce. + This allows accumulating updates over multiple mini-batches before + reducing and applying them. + op: + The reduction operation to use when combining gradients across + different ranks. + gradient_predivide_factor: + If op == Average, gradient_predivide_factor splits the averaging + before and after the sum. Gradients are scaled by + 1.0 / gradient_predivide_factor before the sum and + gradient_predivide_factor / size after the sum. + average_aggregated_gradients: + Whether to average the aggregated gradients that have been accumulated + over multiple mini-batches. If true divides gradients updates by + backward_passes_per_step. Only applicable for backward_passes_per_step > 1. + num_groups: + Number of groups to assign gradient allreduce ops to for explicit + grouping. Defaults to no explicit groups. 
+ """ + if gradient_predivide_factor != 1.0: + if rocm_built(): + raise ValueError('gradient_predivide_factor not supported yet with ROCm') + if op != Average: + raise ValueError('gradient_predivide_factor not supported with op != Average') + + if op == Adasum and average_aggregated_gradients: + raise ValueError('Adasum does not support average_aggregated_gradients == True') + + if isinstance(optimizer, _LegacyOptimizer): + if op == Adasum: + return _DistributedAdasumOptimizer(optimizer, name, use_locking, device_dense, + device_sparse, grace, compression, backward_passes_per_step) + + return _DistributedOptimizer( + optimizer=optimizer, + name=name, + use_locking=use_locking, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + sparse_as_dense=sparse_as_dense, + op=op, + gradient_predivide_factor=gradient_predivide_factor, + backward_passes_per_step=backward_passes_per_step, + average_aggregated_gradients=average_aggregated_gradients, + num_groups=num_groups + ) + elif isinstance(optimizer, tf.keras.optimizers.Optimizer): + if op == Adasum: + raise ValueError('op == Adasum is not supported yet with Keras') + + import horovod.tensorflow.keras as hvd_k + return hvd_k.DistributedOptimizer( + optimizer=optimizer, + name=name, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + sparse_as_dense=sparse_as_dense, + gradient_predivide_factor=gradient_predivide_factor, + backward_passes_per_step=backward_passes_per_step, + average_aggregated_gradients=average_aggregated_gradients, + ) + else: + raise ValueError('Provided optimizer doesn\'t inherit from either legacy ' + 'TensorFlow or Keras optimizer: %s' % optimizer) + + +if hasattr(tf, 'GradientTape'): + class _DistributedGradientTape(tf.GradientTape): + def __init__(self, tape, device_dense, device_sparse, grace, compression, sparse_as_dense, op, + gradient_predivide_factor, num_groups, persistent=False, 
watch_accessed_variables=True): + if hasattr(tape, '_watch_accessed_variables'): + super(self.__class__, self).__init__(persistent, watch_accessed_variables) + else: + super(self.__class__, self).__init__(persistent) + + self._tape = tape + self._allreduce_grads = _make_allreduce_grads_fn( + 'DistributedGradientTape', device_dense, device_sparse, grace, compression, + sparse_as_dense, op, gradient_predivide_factor, num_groups) + + def gradient(self, target, sources, output_gradients=None): + gradients = super(self.__class__, self).gradient(target, sources, output_gradients) + return self._allreduce_grads(gradients) + + + def DistributedGradientTape(gradtape, device_dense='', device_sparse='', + grace=None, compression=Compression.none, sparse_as_dense=False, + op=Average, gradient_predivide_factor=1.0, + num_groups=0): + """A tape that wraps another tf.GradientTape, using an allreduce to + combine gradient values before applying gradients to model weights. + + Args: + gradtape: + GradientTape to use for computing gradients and applying updates. + device_dense: + Device to be used for dense tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + device_sparse: + Device to be used for sparse tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + compression: + Compression algorithm used during allreduce to reduce the amount + of data sent during each parameter update step. Defaults to + not using compression. + sparse_as_dense: + Treat all sparse gradients as dense tensors. This can help improve + performance and memory utilization if the original sparse gradient + has high density. Defaults to false. + op: + The reduction operation to use when combining gradients across + different ranks. + gradient_predivide_factor: + If op == Average, gradient_predivide_factor splits the averaging + before and after the sum. 
Gradients are scaled by + 1.0 / gradient_predivide_factor before the sum and + gradient_predivide_factor / size after the sum. + num_groups: + Number of groups to assign gradient allreduce ops to for explicit + grouping. Defaults to no explicit groups. + """ + if gradient_predivide_factor != 1.0: + if rocm_built(): + raise ValueError('gradient_predivide_factor not supported yet with ROCm') + if op != Average: + raise ValueError('gradient_predivide_factor not supported with op != Average') + + cls = type(gradtape.__class__.__name__, (gradtape.__class__,), + dict(_DistributedGradientTape.__dict__)) + if hasattr(gradtape, '_watch_accessed_variables'): + return cls(gradtape._tape, device_dense, device_sparse, grace, compression, + sparse_as_dense, op, gradient_predivide_factor, num_groups, + gradtape._persistent, gradtape._watch_accessed_variables) + else: + return cls(gradtape._tape, device_dense, device_sparse, grace, compression, + sparse_as_dense, op, gradient_predivide_factor, num_groups, + gradtape._persistent) \ No newline at end of file diff --git a/patch_files/horovod/tensorflow/keras/__init__.py b/patch_files/horovod/tensorflow/keras/__init__.py new file mode 100644 index 0000000..31bfac3 --- /dev/null +++ b/patch_files/horovod/tensorflow/keras/__init__.py @@ -0,0 +1,225 @@ +# Copyright 2018 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import inspect + +import tensorflow as tf + +from tensorflow import keras +from tensorflow.python.keras import backend as K + +from horovod.tensorflow import init +from horovod.tensorflow import shutdown +from horovod.tensorflow import is_initialized, start_timeline, stop_timeline +from horovod.tensorflow import size +from horovod.tensorflow import local_size +from horovod.tensorflow import rank +from horovod.tensorflow import local_rank +from horovod.tensorflow import mpi_threads_supported, mpi_enabled, mpi_built +from horovod.tensorflow import gloo_enabled, gloo_built +from horovod.tensorflow import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built +from horovod.tensorflow import Average, Compression, Sum + +import horovod._keras as _impl +from horovod.tensorflow.keras import callbacks, elastic + + +try: + # In later versions of TensorFlow, optimizers are spread across multiple modules. This set is used to distinguish + # stock optimizers that come with tf.keras from custom optimizers that may need to be wrapped specially. + _OPTIMIZER_MODULES = set([obj.__module__ for name, obj in inspect.getmembers(tf.keras.optimizers) + if isinstance(obj, type(tf.keras.optimizers.Optimizer))]) +except: + _OPTIMIZER_MODULES = set() + + +def DistributedOptimizer(optimizer, name=None, + device_dense='', device_sparse='', + grace=None, + compression=Compression.none, + sparse_as_dense=False, + gradient_predivide_factor=1.0, + op=Average, + backward_passes_per_step=1, + average_aggregated_gradients=False): + """ + An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to + average gradient values before applying gradients to model weights. + + Args: + optimizer: Optimizer to use for computing gradients and applying updates. + name: Optional name prefix for the operations created when applying + gradients. 
Defaults to "Distributed" followed by the provided + optimizer type. + device_dense: Device to be used for dense tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + device_sparse: Device to be used for sparse tensors. Uses GPU by default + if Horovod was built with HOROVOD_GPU_OPERATIONS. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. + sparse_as_dense: Treat all sparse gradients as dense tensors. This can + help improve performance and memory utilization if + the original sparse gradient has high density. + Defaults to false. + gradient_predivide_factor: gradient_predivide_factor splits the averaging + before and after the sum. Gradients are scaled by + 1.0 / gradient_predivide_factor before the sum and + gradient_predivide_factor / size after the sum. + op: The reduction operation to use when combining gradients across + different ranks. Defaults to Average. + backward_passes_per_step: Number of backward passes to perform before calling + hvd.allreduce. This allows accumulating updates over + multiple mini-batches before reducing and applying them. + average_aggregated_gradients: Whether to average the aggregated gradients that + have been accumulated over multiple mini-batches. + If true divides gradient updates by + backward_passes_per_step. + Only applicable for backward_passes_per_step > 1. 
+ """ + if gradient_predivide_factor != 1.0 and rocm_built(): + raise ValueError('gradient_predivide_factor not supported yet with ROCm') + + if op != Average and op != Sum: + raise ValueError('op currently only supports Average and Sum') + + return _impl.create_distributed_optimizer( + keras=keras, + optimizer=optimizer, + name=name, + device_dense=device_dense, + device_sparse=device_sparse, + grace=grace, + compression=compression, + sparse_as_dense=sparse_as_dense, + gradient_predivide_factor=gradient_predivide_factor, + op=op, + backward_passes_per_step=backward_passes_per_step, + average_aggregated_gradients=average_aggregated_gradients, + ) + + +def broadcast_global_variables(root_rank): + """Broadcasts all global variables from root rank to all other processes. + + Arguments: + root_rank: Rank of the process from which global variables will be broadcasted + to all other processes. + """ + return _impl.broadcast_global_variables(K, root_rank) + + +def allreduce(value, name=None, average=True, + prescale_factor=1.0, + postscale_factor=1.0, + op=None, + grace=None, + compression=Compression.none): + """ + Perform an allreduce on a tensor-compatible value. + + Arguments: + value: A tensor-compatible value to reduce. + The shape of the input must be identical across all ranks. + name: Optional name for the constants created by this operation. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v0.21.0. + + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average if None is given. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. 
+ """ + return _impl.allreduce( + backend=K, + value=value, + name=name, + average=average, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + op=op, + grace=grace, + compression=compression) + + +def allgather(value, name=None): + """ + Perform an allgather on a tensor-compatible value. + + The concatenation is done on the first dimension, so the input values on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + value: A tensor-compatible value to gather. + name: Optional name prefix for the constants created by this operation. + """ + return _impl.allgather(K, value, name) + + +def broadcast(value, root_rank, name=None): + """ + Perform a broadcast on a tensor-compatible value. + + Arguments: + value: A tensor-compatible value to reduce. + The shape of the input must be identical across all ranks. + root_rank: Rank of the process from which global variables will be + broadcasted to all other processes. + name: Optional name for the constants created by this operation. + """ + return _impl.broadcast(K, value, root_rank, name) + + +def load_model(filepath, custom_optimizers=None, custom_objects=None, grace=None, compression=Compression.none): + """ + Loads a saved Keras model with a Horovod DistributedOptimizer. + + The DistributedOptimizer will wrap the underlying optimizer used to train + the saved model, so that the optimizer state (params and weights) will + be picked up for retraining. + + By default, all optimizers in the module `keras.optimizers` will be loaded + and wrapped without needing to specify any `custom_optimizers` or + `custom_objects`. + + Arguments: + filepath: One of the following: + - string, path to the saved model, or + - h5py.File object from which to load the model + custom_optimizers: Optional list of Optimizer subclasses to support + during loading. 
+ custom_objects: Optional dictionary mapping names (strings) to custom + classes or functions to be considered during deserialization. + compression: Compression algorithm used to reduce the amount of data + sent and received by each worker node. Defaults to not + using compression. + + Returns: + A Keras model instance. + + Raises: + ImportError: If h5py is not available. + ValueError: In case of an invalid savefile. + """ + def wrap_optimizer(cls): + return lambda **kwargs: DistributedOptimizer(cls(**kwargs), grace=grace, compression=compression) + return _impl.load_model(keras, wrap_optimizer, _OPTIMIZER_MODULES, filepath, custom_optimizers, custom_objects) + diff --git a/patch_files/horovod/torch/__init__.py b/patch_files/horovod/torch/__init__.py index 7ad827e..d17c61d 100644 --- a/patch_files/horovod/torch/__init__.py +++ b/patch_files/horovod/torch/__init__.py @@ -16,44 +16,33 @@ from horovod.common.util import check_extension -_MPI_LIB_AVAILABLE = True try: check_extension('horovod.torch', 'HOROVOD_WITH_PYTORCH', __file__, 'mpi_lib_v2') -except Exception as e: - # MPI libs are missing, but python applications are still available. - print(e) - print("Warning! 
MPI libs are missing, but python applications are still available.") - _MPI_LIB_AVAILABLE = False +except: + check_extension('horovod.torch', 'HOROVOD_WITH_PYTORCH', + __file__, 'mpi_lib', '_mpi_lib') + +from horovod.torch import elastic +from horovod.torch.compression import Compression +from horovod.torch.functions import allgather_object, broadcast_object, broadcast_optimizer_state, broadcast_parameters +from horovod.torch.mpi_ops import allreduce, allreduce_async, allreduce_, allreduce_async_ +from horovod.torch.mpi_ops import grouped_allreduce, grouped_allreduce_async, grouped_allreduce_, grouped_allreduce_async_ +from horovod.torch.mpi_ops import allgather, allgather_async +from horovod.torch.mpi_ops import broadcast, broadcast_async, broadcast_, broadcast_async_ +from horovod.torch.mpi_ops import alltoall, alltoall_async +from horovod.torch.mpi_ops import join +from horovod.torch.mpi_ops import poll, synchronize +from horovod.torch.mpi_ops import init, shutdown +from horovod.torch.mpi_ops import is_initialized, start_timeline, stop_timeline +from horovod.torch.mpi_ops import size, local_size, rank, local_rank +from horovod.torch.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built +from horovod.torch.mpi_ops import gloo_enabled, gloo_built +from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built +from horovod.torch.mpi_ops import Average, Sum, Adasum +from horovod.torch.optimizer import DistributedOptimizer +from horovod.torch.sync_batch_norm import SyncBatchNorm -# only import following function when mpi is available. 
-if _MPI_LIB_AVAILABLE: - from horovod.torch import elastic - from horovod.torch.compression import Compression - from horovod.torch.functions import allgather_object, broadcast_object, broadcast_optimizer_state, broadcast_parameters - from horovod.torch.mpi_ops import allreduce, allreduce_async, allreduce_, allreduce_async_ - from horovod.torch.mpi_ops import grouped_allreduce, grouped_allreduce_async, grouped_allreduce_, grouped_allreduce_async_ - from horovod.torch.mpi_ops import sparse_allreduce_async - from horovod.torch.mpi_ops import allgather, allgather_async - from horovod.torch.mpi_ops import grouped_allgather, grouped_allgather_async - from horovod.torch.mpi_ops import broadcast, broadcast_async, broadcast_, broadcast_async_ - from horovod.torch.mpi_ops import alltoall, alltoall_async - from horovod.torch.mpi_ops import reducescatter, reducescatter_async - from horovod.torch.mpi_ops import grouped_reducescatter, grouped_reducescatter_async - from horovod.torch.mpi_ops import join - from horovod.torch.mpi_ops import barrier - from horovod.torch.mpi_ops import poll, synchronize - from horovod.torch.mpi_ops import init, shutdown - from horovod.torch.mpi_ops import is_initialized, start_timeline, stop_timeline - from horovod.torch.mpi_ops import size, local_size, cross_size, rank, local_rank, cross_rank - from horovod.torch.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built - from horovod.torch.mpi_ops import gloo_enabled, gloo_built - from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built - from horovod.torch.mpi_ops import ProcessSet, global_process_set, add_process_set, remove_process_set - from horovod.torch.mpi_ops import Average, Sum, Adasum, Min, Max, Product - from horovod.torch.mpi_ops import HorovodInternalError - from horovod.torch.optimizer import DistributedOptimizer - from horovod.torch.sync_batch_norm import SyncBatchNorm # Please run this function in a subprocess def _check_has_gpu(): diff --git 
a/patch_files/horovod/torch/mpi_ops.py b/patch_files/horovod/torch/mpi_ops.py index 8ee3474..a4c16f7 100644 --- a/patch_files/horovod/torch/mpi_ops.py +++ b/patch_files/horovod/torch/mpi_ops.py @@ -20,37 +20,25 @@ import warnings +from horovod.torch import mpi_lib_v2 as mpi_lib from horovod.common.basics import HorovodBasics as _HorovodBasics +_NULL = "" +_basics = _HorovodBasics(__file__, 'mpi_lib_v2') + from horovod.common.exceptions import HorovodInternalError -from horovod.common.process_sets import _setup as _setup_process_sets -from horovod.common.process_sets import ProcessSet, global_process_set, add_process_set, remove_process_set -from horovod.common.util import check_installed_version, get_average_backwards_compatibility_fun, gpu_available, num_rank_is_power_2 +from horovod.common.util import get_average_backwards_compatibility_fun, gpu_available, num_rank_is_power_2 from horovod.torch.compression import Compression -# Check possible symbol not found error from pytorch version mismatch -try: - from horovod.torch import mpi_lib_v2 as mpi_lib -except Exception as e: - check_installed_version('pytorch', torch.__version__, e) - raise e -else: - check_installed_version('pytorch', torch.__version__) - -_NULL = "" - -_basics = _HorovodBasics(__file__, 'mpi_lib_v2') - # import basic methods +init = _basics.init is_initialized = _basics.is_initialized start_timeline = _basics.start_timeline stop_timeline = _basics.stop_timeline size = _basics.size local_size = _basics.local_size -cross_size = _basics.cross_size rank = _basics.rank local_rank = _basics.local_rank -cross_rank = _basics.cross_rank mpi_threads_supported = _basics.mpi_threads_supported mpi_enabled = _basics.mpi_enabled mpi_built = _basics.mpi_built @@ -61,32 +49,19 @@ ccl_built = _basics.ccl_built cuda_built = _basics.cuda_built rocm_built = _basics.rocm_built - def shutdown(*args, **kwargs): mpi_lib.horovod_torch_reset() return _basics.shutdown(*args, **kwargs) -def init(*args, **kwargs): - global 
_handle_map - _handle_map = {} - _basics.init(*args, **kwargs) - # Call set up again to make sure the basics is in sync - _setup_process_sets(_basics) - # import reduction op values Average = _basics.Average Sum = _basics.Sum Adasum = _basics.Adasum -Min = _basics.Min -Max = _basics.Max -Product = _basics.Product is_homogeneous = _basics.is_homogeneous handle_average_backwards_compatibility = get_average_backwards_compatibility_fun(_basics) -_setup_process_sets(_basics) - # Schema: handle -> input, output # We keep input in order to make sure it does not get garbage collected @@ -107,19 +82,17 @@ def _allreduce_function_factory(tensor): return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') -def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): +def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform averaging at framework level - divisor = process_set.size() + divisor = size() op = Sum else: divisor = 1 elif op == Adasum: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensor.device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -147,7 +120,7 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor try: handle = getattr(mpi_lib, function)(tensor, output, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor, process_set.process_set_id) + prescale_factor, postscale_factor) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) @@ -155,8 +128,7 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor def allreduce_async(tensor, average=None, name=None, op=None, - prescale_factor=1.0, 
postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs asynchronous averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -171,16 +143,13 @@ def allreduce_async(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different + ranks. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. 
Returns: A handle to the allreduce operation that can be used with `poll()` or @@ -188,32 +157,30 @@ def allreduce_async(tensor, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) output = tensor.new(tensor.shape) - return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set) + return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor) class HorovodAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a tensor.""" @staticmethod - def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor, process_set): + def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - ctx.process_set = process_set - handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor, process_set) + handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor) return synchronize(handle) @staticmethod def backward(ctx, grad_output): return allreduce(grad_output, average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor, - process_set=ctx.process_set), None, None, None, None, None, None + postscale_factor=ctx.postscale_factor), None, None, None, None, None def allreduce(tensor, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -232,19 +199,16 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. 
Will be removed in v0.21.0. name: A name of the reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults + op: The reduction operation to combine tensors across different ranks. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all @@ -252,14 +216,12 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= """ tensor_compressed, ctx = compression.compress(tensor) summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name, op, - prescale_factor, postscale_factor, - process_set) + prescale_factor, postscale_factor) return compression.decompress(summed_tensor_compressed, ctx) def allreduce_async_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs asynchronous in-place averaging or summation of the input tensor over all the Horovod processes. @@ -274,28 +236,24 @@ def allreduce_async_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. 
Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different ranks. Defaults to + Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the allreduce operation that can be used with `poll()` or `synchronize()`. """ op = handle_average_backwards_compatibility(op, average) - return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor, process_set) + return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor) def allreduce_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs in-place averaging or summation of the input tensor over all the Horovod processes. @@ -310,22 +268,19 @@ def allreduce_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different ranks. Defaults to + Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all processes. 
""" - handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor, process_set) + handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor) return synchronize(handle) @@ -333,18 +288,16 @@ def _grouped_allreduce_function_factory(tensor): return 'horovod_torch_grouped_allreduce_async_' + tensor.type().replace('.', '_') -def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): +def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform averaging at framework level - divisor = process_set.size() + divisor = size() op = Sum else: divisor = 1 elif op == Adasum: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensors[0].device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -372,7 +325,7 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc try: handle = getattr(mpi_lib, function)(tensors, outputs, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor, process_set.process_set_id) + prescale_factor, postscale_factor) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tuple(tensors), tuple(outputs)) @@ -380,8 +333,7 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc def grouped_allreduce_async(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs asynchronous averaging or summation of the input tensor list over all the Horovod processes. The input tensors are not modified. 
@@ -398,16 +350,13 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different + ranks. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or @@ -415,34 +364,31 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) outputs = [t.new(t.shape) for t in tensors] - return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set) + return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor) class HorovodGroupedAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a list of tensors.""" @staticmethod - def forward(ctx, average, name, op, prescale_factor, postscale_factor, process_set: ProcessSet, *tensors): + def forward(ctx, average, name, op, prescale_factor, postscale_factor, *tensors): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - ctx.process_set = process_set - handle = grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor, - process_set) + handle = grouped_allreduce_async(list(tensors), 
average, name, op, prescale_factor, postscale_factor) return synchronize(handle) @staticmethod def backward(ctx, *grad_output): grad_reduced = grouped_allreduce(list(grad_output), average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor, - process_set=ctx.process_set) - return (None, None, None, None, None, None, *grad_reduced) + postscale_factor=ctx.postscale_factor) + return (None, None, None, None, None, *grad_reduced) def grouped_allreduce(tensors, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs averaging or summation of the input tensor list over all the Horovod processes. The input tensors are not modified. @@ -463,19 +409,16 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A base name to use for the group reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults + op: The reduction operation to combine tensors across different ranks. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. 
Returns: A list containing tensors of the same shape and type as in `tensors`, @@ -484,13 +427,12 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. tensors_compressed, ctxs = zip(*[compression.compress(t) for t in tensors]) summed_tensors_compressed = HorovodGroupedAllreduce.apply(average, name, op, prescale_factor, postscale_factor, - process_set, *tensors_compressed) + *tensors_compressed) return [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] def grouped_allreduce_async_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs asynchronous in-place averaging or summation of the input tensors over all the Horovod processes. @@ -507,28 +449,24 @@ def grouped_allreduce_async_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different ranks. Defaults to + Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or `synchronize()`. 
""" op = handle_average_backwards_compatibility(op, average) - return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor, process_set) + return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor) def grouped_allreduce_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): + prescale_factor=1.0, postscale_factor=1.0): """ A function that performs in-place averaging or summation of the input tensors over all the Horovod processes. @@ -545,66 +483,38 @@ def grouped_allreduce_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v1.0. + Use `op` instead. Will be removed in v0.21.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. + op: The reduction operation to combine tensors across different ranks. Defaults to + Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A list containing tensors of the same shape and type as in `tensors`, averaged or summed across all processes. 
""" - handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor, process_set) + handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor) return synchronize(handle) -def sparse_allreduce_async(tensor, name, op, process_set=global_process_set): - # Allgather aggregates along the first dimension, so we need to transpose the - # indices to enforce correct concatenation behavior, then transpose back prior to - # constructing the new aggregated sparse gradient - t = tensor - indices_handle = allgather_async(t._indices().transpose(0, 1).contiguous(), name=f'{name}.indices', - process_set=process_set) - values_handle = allgather_async(t._values(), name=f'{name}.values', process_set=process_set) - - def handle(): - # We need to sync values handle firstly for torch nightly >= 10.0 - # Issue: https://github.com/horovod/horovod/issues/2961 - values = synchronize(values_handle) - indices = synchronize(indices_handle) - - values = (values / process_set.size()) if op == Average else values - - if indices.dim() == 0 or values.dim() == 0: - return t.new().resize_as_(t) - return t.new(indices.transpose(0, 1), values, t.size()) - - return handle - - def _allgather_function_factory(tensor): return 'horovod_torch_allgather_async_' + tensor.type().replace('.', '_') -def _allgather_async(tensor, output, name, process_set: ProcessSet): +def _allgather_async(tensor, output, name): function = _check_function(_allgather_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, name.encode() if name is not None else _NULL, - process_set.process_set_id) + tensor, output, name.encode() if name is not None else _NULL) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def allgather_async(tensor, name=None, process_set=global_process_set): +def allgather_async(tensor, name=None): """ A function that asynchronously concatenates the 
input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -616,47 +526,44 @@ def allgather_async(tensor, name=None, process_set=global_process_set): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the allgather operation that can be used with `poll()` or `synchronize()`. """ output = tensor.new() - return _allgather_async(tensor, output, name, process_set) + return _allgather_async(tensor, output, name) class HorovodAllgather(torch.autograd.Function): """An autograd function that performs allgather on a tensor.""" @staticmethod - def forward(ctx, tensor, name, process_set: ProcessSet): + def forward(ctx, tensor, name): ctx.dim = tensor.shape[0] - ctx.process_set = process_set - handle = allgather_async(tensor, name, process_set) + handle = allgather_async(tensor, name) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#average CHANGE + grad_reduced = allreduce(grad_output, average=False) dim_t = torch.IntTensor([ctx.dim]) - dim = allgather(dim_t, process_set=ctx.process_set).view(ctx.process_set.size()) + dim = allgather(dim_t).view(size()) - r = ctx.process_set.rank() + r = rank() offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - return grad_reduced.narrow(0, offset, ctx.dim), None, None + return grad_reduced.narrow(0, offset, ctx.dim), None -def allgather(tensor, name=None, process_set=global_process_set): +def allgather(tensor, name=None): """ A function that concatenates the input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. 
- The concatenation is done on the first dimension, so the corresponding - input tensors on the different processes must have the same rank and - shape, except for the first dimension, which is allowed to be different. + The concatenation is done on the first dimension, so the input tensors on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. This acts as a thin wrapper around an autograd function. If your input tensor requires gradients, then callings this function will allow gradients @@ -665,8 +572,6 @@ def allgather(tensor, name=None, process_set=global_process_set): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A tensor of the same type as `tensor`, concatenated on dimension zero @@ -674,118 +579,25 @@ def allgather(tensor, name=None, process_set=global_process_set): the first dimension, which may be greater and is the sum of all first dimensions of the tensors in different Horovod processes. 
""" - return HorovodAllgather.apply(tensor, name, process_set) - - -def _grouped_allgather_function_factory(tensor): - return 'horovod_torch_grouped_allgather_async_' + tensor.type().replace('.', '_') - - -def _grouped_allgather_async(tensors, outputs, name, process_set: ProcessSet): - function = _check_function(_grouped_allgather_function_factory, tensors[0]) - try: - handle = getattr(mpi_lib, function)( - tensors, outputs, name.encode() if name is not None else _NULL, - process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tuple(tensors), tuple(outputs)) - return handle - - -def grouped_allgather_async(tensors, name=None, process_set=global_process_set): - """ - A function that asynchronously concatenates each input tensor with the corresponding - input tensor on all other Horovod processes for a list of input tensors. The input - tensors are not modified. - - The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Arguments: - tensors: A list of tensors to allgather. - name: A base name to use for the group allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the group allgather operation that can be used with `poll()` or - `synchronize()`. 
- """ - outputs = [t.new() for t in tensors] - return _grouped_allgather_async(tensors, outputs, name, process_set) - - -class HorovodGroupedAllgather(torch.autograd.Function): - """An autograd function that performs allgather on a list of tensors.""" - - @staticmethod - def forward(ctx, name, process_set: ProcessSet, *tensors): - ctx.dims = [t.shape[0] for t in tensors] - ctx.process_set = process_set - handle = grouped_allgather_async(list(tensors), name, process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, *grad_output): - grad_reduced = grouped_allreduce(list(grad_output), average=True, process_set=ctx.process_set) - - dim_ts = [torch.IntTensor([dim]) for dim in ctx.dims] - dims = [dt.view(ctx.process_set.size()) - for dt in grouped_allgather(dim_ts, process_set=ctx.process_set)] - - r = ctx.process_set.rank() - offsets = [torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - for dim in dims] - result = [gr.narrow(0, offset, dim) - for gr, offset, dim in zip(grad_reduced, offsets, ctx.dims)] - return (None, None, *result) - - -def grouped_allgather(tensors, name=None, process_set=global_process_set): - """ - A function that concatenates each input tensor with the corresponding - input tensor on all other Horovod processes for a list of input tensors. - The input tensors are not modified. - - The concatenation is done on the first dimension, so the corresponding input - tensors on the different processes must have the same rank and shape, except - for the first dimension, which is allowed to be different. - - Arguments: - tensors: A list of tensors to allgather. - name: A base name to use for the group allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A list containing tensors of the same type as in `tensors`. Each tensor - is concatenated on dimension zero across all processes. 
Its shape is - identical to the corresponding input shape, expect for the first - dimension, which may be greater and is the sum of all first dimensions - of the corresponding tensor in different Horovod processes. - """ - return HorovodGroupedAllgather.apply(name, process_set, *tensors) + return HorovodAllgather.apply(tensor, name) def _broadcast_function_factory(tensor): return 'horovod_torch_broadcast_async_' + tensor.type().replace('.', '_') -def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet): +def _broadcast_async(tensor, output, root_rank, name): function = _check_function(_broadcast_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, root_rank, name.encode() if name is not None else _NULL, - process_set.process_set_id) + tensor, output, root_rank, name.encode() if name is not None else _NULL) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set): +def broadcast_async(tensor, root_rank, name=None): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -799,36 +611,33 @@ def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`. 
""" output = tensor.new(tensor.shape) - return _broadcast_async(tensor, output, root_rank, name, process_set) + return _broadcast_async(tensor, output, root_rank, name) class HorovodBroadcast(torch.autograd.Function): """An autograd function that broadcasts a tensor.""" @staticmethod - def forward(ctx, tensor, root_rank, name, process_set: ProcessSet): + def forward(ctx, tensor, root_rank, name): ctx.root_rank = root_rank - ctx.process_set = process_set - handle = broadcast_async(tensor, root_rank, name, process_set) + handle = broadcast_async(tensor, root_rank, name) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#True CHANGED + grad_reduced = allreduce(grad_output, average=False) if rank() != ctx.root_rank: grad_reduced *= 0 - return grad_reduced, None, None, None + return grad_reduced, None, None -def broadcast(tensor, root_rank, name=None, process_set=global_process_set): +def broadcast(tensor, root_rank, name=None): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -846,17 +655,15 @@ def broadcast(tensor, root_rank, name=None, process_set=global_process_set): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - return HorovodBroadcast.apply(tensor, root_rank, name, process_set) + return HorovodBroadcast.apply(tensor, root_rank, name) -def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_set): +def broadcast_async_(tensor, root_rank, name=None): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -870,17 +677,15 @@ def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_se tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`. """ - return _broadcast_async(tensor, tensor, root_rank, name, process_set) + return _broadcast_async(tensor, tensor, root_rank, name) -def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): +def broadcast_(tensor, root_rank, name=None): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -894,37 +699,35 @@ def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - handle = broadcast_async_(tensor, root_rank, name, process_set) + handle = broadcast_async_(tensor, root_rank, name) return synchronize(handle) def _alltoall_function_factory(tensor): return 'horovod_torch_alltoall_async_' + tensor.type().replace('.', '_') -def _alltoall_async(tensor, splits, output, output_received_splits, name, process_set: ProcessSet): +def _alltoall_async(tensor, splits, output, name): if splits is None: # If splits not provided, create empty tensor as placeholder splits = torch.tensor([], dtype=torch.int32, device='cpu') elif not isinstance(splits, torch.Tensor): splits = torch.tensor(splits, dtype=torch.int32, device='cpu') + function = _check_function(_alltoall_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, splits, output, output_received_splits, name.encode() if name is not None else _NULL, - process_set.process_set_id) + tensor, splits, output, name.encode() if name is not None else _NULL) except RuntimeError as e: raise HorovodInternalError(e) - _handle_map[handle] = (tensor, splits, (output, output_received_splits)) + _handle_map[handle] = (tensor, splits, output) return handle -def alltoall_async(tensor, splits=None, name=None, process_set=global_process_set): +def alltoall_async(tensor, splits=None, name=None): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. The input @@ -942,45 +745,37 @@ def alltoall_async(tensor, splits=None, name=None, process_set=global_process_se not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: A handle to the alltoall operation that can be used with `poll()` or `synchronize()`. 
""" output = tensor.new() - if isinstance(splits, torch.Tensor): - output_received_splits = splits.new() - else: - output_received_splits = torch.empty(size(), dtype=torch.int32, device='cpu') - return _alltoall_async(tensor, splits, output, output_received_splits, name, process_set) + return _alltoall_async(tensor, splits, output, name) class HorovodAlltoall(torch.autograd.Function): """An autograd function that performs alltoall on a tensor.""" @staticmethod - def forward(ctx, tensor, splits, name, process_set: ProcessSet): - handle = alltoall_async(tensor, splits, name, process_set) - output, received_splits = synchronize(handle) - - ctx.process_set = process_set - ctx.recvsplits = received_splits - if splits is None: - return output - else: - ctx.mark_non_differentiable(received_splits) - return output, received_splits + def forward(ctx, tensor, splits, name): + ctx.tensor = tensor + ctx.splits = splits + handle = alltoall_async(tensor, splits, name) + return synchronize(handle) @staticmethod - def backward(ctx, grad_output, *dead_gradients): - grad_wrt_tensor, _ = alltoall(grad_output, splits=ctx.recvsplits, - process_set=ctx.process_set) - return grad_wrt_tensor, None, None, None + def backward(ctx, grad_output): + recvsplits = None + if ctx.splits is not None: + recvsplits = alltoall(ctx.splits, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) + else: + splits_equal = torch.ones(size(), dtype=torch.int32, device='cpu') * (ctx.tensor.size()[0] // size()) + recvsplits = alltoall(splits_equal, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) + return alltoall(grad_output, splits=recvsplits), None, None -def alltoall(tensor, splits=None, name=None, process_set=global_process_set): +def alltoall(tensor, splits=None, name=None): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. 
The input @@ -1002,263 +797,22 @@ def alltoall(tensor, splits=None, name=None, process_set=global_process_set): not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. Returns: - 1) A tensor containing the gathered tensor data from all workers. - 2) If `splits` has been provided: A tensor of integers in rank order - describing how many elements in the output tensor have been received - from each worker. - """ - return HorovodAlltoall.apply(tensor, splits, name, process_set) - - -def _reducescatter_function_factory(tensor): - return 'horovod_torch_reducescatter_async_' + tensor.type().replace('.', '_') - - -def _reducescatter_async(tensor, output, name, op, process_set: ProcessSet, - prescale_factor, postscale_factor): - function = _check_function(_reducescatter_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)(tensor, output, - name.encode() if name is not None else _NULL, - op, process_set.process_set_id, - prescale_factor, postscale_factor) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, output) - return handle - - -def reducescatter_async(tensor, name=None, op=Average, process_set=global_process_set, - prescale_factor=1.0, postscale_factor=1.0): + A tensor containing the gathered tensor data from all workers. """ - A function that performs asynchronous reduction of the input tensor over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensor is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. 
- - The input tensors on the different processes must have the same rank and shape. The - output tensor will be the same rank on all processes, but the first dimension may - be different. - - Arguments: - tensor: A tensor to average and sum. - name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensor before reducescatter. - postscale_factor: Multiplicative factor to scale tensor after reducescatter. - - Returns: - A handle to the reducescatter operation that can be used with `poll()` or - `synchronize()`. - """ - output = tensor.new() - return _reducescatter_async(tensor, output, name, op, process_set, - prescale_factor, postscale_factor) - - -class HorovodReducescatter(torch.autograd.Function): - """An autograd function that performs reducescatter on a tensor.""" - - @staticmethod - def forward(ctx, tensor, name, op, process_set, prescale_factor, postscale_factor): - ctx.op = op - ctx.process_set = process_set - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - handle = reducescatter_async(tensor, name, op, process_set, prescale_factor, postscale_factor) - return synchronize(handle) - - @staticmethod - def backward(ctx, grad_output): - if ctx.op == Sum: - grad_output *= ctx.process_set.size() - if ctx.prescale_factor != 1.0: - grad_output *= ctx.prescale_factor - if ctx.postscale_factor != 1.0: - grad_output *= ctx.postscale_factor - - return allgather(grad_output, process_set=ctx.process_set), None, None, None, None, None - - -def reducescatter(tensor, name=None, compression=Compression.none, op=Average, - process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs reduction of the input tensor over all the Horovod - 
processes, then scatters the results across all Horovod processes. The input tensor - is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to average/sum and scatter. - name: A name of the reduction operation. - compression: Compression algorithm used during reducescatter to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensor before reducescatter. - postscale_factor: Multiplicative factor to scale tensor after reducescatter. - - Returns: - A tensor of the same rank and type as `tensor` across all processes. The shape - is identical to the input shape, except for the first dimension, which will be - divided across the different Horovod processes. 
- """ - tensor_compressed, ctx = compression.compress(tensor) - reduced_tensor_compressed = HorovodReducescatter.apply(tensor_compressed, name, op, process_set, - prescale_factor, postscale_factor) - return compression.decompress(reduced_tensor_compressed, ctx) - - -def _grouped_reducescatter_function_factory(tensor): - return 'horovod_torch_grouped_reducescatter_async_' + tensor.type().replace('.', '_') - - -def _grouped_reducescatter_async(tensors, outputs, name, op, process_set: ProcessSet, - prescale_factor, postscale_factor): - function = _check_function(_grouped_reducescatter_function_factory, tensors[0]) - try: - handle = getattr(mpi_lib, function)(tensors, outputs, - name.encode() if name is not None else _NULL, - op, process_set.process_set_id, - prescale_factor, postscale_factor) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tuple(tensors), tuple(outputs)) - return handle - - -def grouped_reducescatter_async(tensors, name=None, op=Average, process_set=global_process_set, - prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs asynchronous reduction of a list of input tensors over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensors are not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. For each of the input tensors, the type and shape must - be the same on all Horovod processes for a given name. The reduction will not start - until all processes are ready to send and receive the tensors. - - The input tensor at some place in the list tensor must have the same rank and shape - on the different processes. The corresponding output tensor will be the same rank on - all processes, but the first dimension may be different. - - Arguments: - tensors: A list of tensors to average and sum. - name: A base name to use for the group reduction operation. 
- op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensors before reducescatter. - postscale_factor: Multiplicative factor to scale tensors after reducescatter. - - Returns: - A handle to the group reducescatter operation that can be used with `poll()` or - `synchronize()`. - """ - outputs = [t.new() for t in tensors] - return _grouped_reducescatter_async(tensors, outputs, name, op, process_set, - prescale_factor, postscale_factor) - - -class HorovodGroupedReducescatter(torch.autograd.Function): - """An autograd function that performs reducescatter on a list of tensors.""" - - @staticmethod - def forward(ctx, name, op, process_set, prescale_factor, postscale_factor, *tensors): - ctx.op = op - ctx.process_set = process_set - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - handle = grouped_reducescatter_async(list(tensors), name, op, process_set, - prescale_factor, postscale_factor, ) - return synchronize(handle) - - @staticmethod - def backward(ctx, *grad_output): - if ctx.op == Sum: - grad_output = [g * ctx.process_set.size() for g in grad_output] - if ctx.prescale_factor != 1.0: - grad_output = [ctx.prescale_factor * g for g in grad_output] - if ctx.postscale_factor != 1.0: - grad_output = [ctx.postscale_factor * g for g in grad_output] - - return (None, None, None, None, None, - *grouped_allgather(grad_output, process_set=ctx.process_set)) - - -def grouped_reducescatter(tensors, name=None, compression=Compression.none, op=Average, - process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs reduction of a list of input tensors over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensors are not modified. 
- - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensors: A list of tensors to average and sum. - name: A base name to use for the group reduction operation. - compression: Compression algorithm used during reducescatter to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensors before reducescatter. - postscale_factor: Multiplicative factor to scale tensors after reducescatter. - - - Returns: - A list containing tensors of the same rank and type as in `tensors`. For each - tensor the shape is identical to the input shape, except for the first dimension, - which will be divided across the different Horovod processes. - """ - tensors_compressed = [] - ctxs = [] - for tensor in tensors: - tensor_compressed, ctx = compression.compress(tensor) - tensors_compressed.append(tensor_compressed) - ctxs.append(ctx) - reduced_tensors_compressed = HorovodGroupedReducescatter.apply(name, op, process_set, - prescale_factor, postscale_factor, - *tensors_compressed) - return [compression.decompress(reduced_tensor_compressed, ctx) - for reduced_tensor_compressed, ctx in zip(reduced_tensors_compressed, ctxs)] + return HorovodAlltoall.apply(tensor, splits, name) def poll(handle): """ - Polls an allreduce, allgather, alltoall, broadcast, or reducescatter handle to determine whether - underlying asynchronous operation has completed. 
After `poll()` returns `True`, - `synchronize()` will return without blocking. + Polls an allreduce, allgather or broadcast handle to determine whether underlying + asynchronous operation has completed. After `poll()` returns `True`, `synchronize()` + will return without blocking. Arguments: - handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter - asynchronous operation. + handle: A handle returned by an allreduce, allgather or broadcast asynchronous + operation. Returns: A flag indicating whether the operation has completed. @@ -1268,15 +822,15 @@ def poll(handle): def synchronize(handle): """ - Synchronizes an asynchronous allreduce, allgather, alltoall, broadcast, or reducescatter operation - until it's completed. Returns the result of the operation. + Synchronizes an asynchronous allreduce, allgather or broadcast operation until + it's completed. Returns the result of the operation. Arguments: - handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter - asynchronous operation. + handle: A handle returned by an allreduce, allgather or broadcast asynchronous + operation. Returns: - A single output tensor of the operation or a tuple of multiple output tensors. + An output tensor of the operation. """ if handle not in _handle_map: return @@ -1286,11 +840,10 @@ def synchronize(handle): output = _handle_map.pop(handle)[-1] return output except RuntimeError as e: - _handle_map.pop(handle, None) raise HorovodInternalError(e) -def join(device=-1) -> int: +def join(device=-1): """A function that indicates that the rank finished processing data. All ranks that did not call join() continue to process allreduce operations. @@ -1302,32 +855,7 @@ def join(device=-1) -> int: Returns: Id of the rank that joined last. 
""" - output = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu")) - try: - handle = mpi_lib.horovod_torch_join(output, device) - except RuntimeError as e: - raise HorovodInternalError(e) - - _handle_map[handle] = (None, output) - - return synchronize(handle).item() - -def barrier(process_set=global_process_set): - """ - A function that acts as a simple sychronization point for ranks specified - in the given process group(default to global group). Ranks that reach - this function call will stall until all other ranks have reached. - - Arguments: - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - """ - try: - handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id) + return mpi_lib.horovod_torch_join(device) except RuntimeError as e: raise HorovodInternalError(e) - - _handle_map[handle] = (None, None) - - synchronize(handle) diff --git a/patch_files/horovod/torch/optimizer.py b/patch_files/horovod/torch/optimizer.py index 7e3f3b1..ffe7d42 100644 --- a/patch_files/horovod/torch/optimizer.py +++ b/patch_files/horovod/torch/optimizer.py @@ -25,32 +25,28 @@ from horovod.torch.compression import Compression from horovod.torch.functions import broadcast_object -from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_, sparse_allreduce_async +from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_ from horovod.torch.mpi_ops import synchronize from horovod.torch.mpi_ops import size from horovod.torch.mpi_ops import Average, Adasum, Sum from horovod.torch.mpi_ops import rocm_built -from horovod.torch.mpi_ops import ProcessSet, global_process_set class _DistributedOptimizer(torch.optim.Optimizer): - def __init__(self, params, named_parameters,grace, compression, + def __init__(self, params, named_parameters, grace, compression, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - groups=None, - sparse_as_dense=False, 
- process_set=global_process_set): + num_groups=0): super(self.__class__, self).__init__(params) - self._grace = grace - self._process_set=process_set self._compression = compression + self._grace = grace if named_parameters is not None: named_parameters = list(named_parameters) else: - named_parameters = [(f'allreduce.noname.{i}.{j}', v) - for i, param_group in enumerate(self.param_groups) - for j, v in enumerate(param_group['params'])] + named_parameters = [('allreduce.noname.%s' % i, v) + for param_group in self.param_groups + for i, v in enumerate(param_group['params'])] # make sure that named_parameters are tuples if any([not isinstance(p, tuple) for p in named_parameters]): raise ValueError('named_parameters should be a sequence of ' @@ -78,33 +74,15 @@ def __init__(self, params, named_parameters,grace, compression, for _, v in sorted(named_parameters)} self.op = op self.gradient_predivide_factor = gradient_predivide_factor - self.sparse_as_dense = sparse_as_dense - self.process_set = process_set - self._handles = {} self._grad_accs = [] self._requires_update = set() self._synchronized = False self._should_synchronize = True - - if groups is not None: - if not (isinstance(groups, list) or groups > 0): - raise ValueError('groups should be a non-negative integer or ' - 'a list of list of torch.Tensor.') - if isinstance(groups, list): - grouped_parameter_ids = set() - for l in groups: - for p in l: - if not isinstance(p, torch.Tensor): - raise ValueError('groups must consist of torch.Tensor.') - if id(p) in grouped_parameter_ids: - raise ValueError('A parameter can only appear once in groups.') - grouped_parameter_ids.add(id(p)) - self._groups = groups + self._num_groups = num_groups self._p_to_group = {} self._group_counts = {} - - if self.process_set.included() and (size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1'): + if size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1': self._register_hooks() def load_state_dict(self, *args, **kwargs): @@ -131,7 
+109,8 @@ def set_backward_passes_per_step(self, passes): self._allreduce_delay[p] = self.backward_passes_per_step def _register_hooks(self): - if self._groups is not None: + + if self._num_groups > 0: p_list = [] # Get list of parameters with grads for param_group in self.param_groups: @@ -142,29 +121,16 @@ def _register_hooks(self): # To ensure parameter order and group formation is consistent, broadcast p_list order # from rank 0 and use for every worker p_list_names = [self._parameter_names.get(p) for p in p_list] - p_list_names = broadcast_object(p_list_names, root_rank=0, process_set=self.process_set) - p_list = sorted(p_list, key=lambda p: p_list_names.index(self._parameter_names.get(p))) + p_list_names = broadcast_object(p_list_names, root_rank=0) + p_list = sorted(p_list, key=lambda p : p_list_names.index(self._parameter_names.get(p))) # Form groups - if isinstance(self._groups, list): - p_groups = [] - grouped_id = set() - p_list_ids = [id(p) for p in p_list] - for group in self._groups: - p_groups.append([p for p in group if id(p) in p_list_ids]) - for p in p_groups[-1]: - grouped_id.add(id(p)) - for p in p_list: - if id(p) not in grouped_id: - p_groups.append([p]) - else: - p_groups = split_list(p_list, self._groups) - + p_groups = split_list(p_list, self._num_groups) p_groups = [tuple(p) for p in p_groups] for group in p_groups: - for p in group: - self._p_to_group[p] = group - self._group_counts[group] = 0 + for p in group: + self._p_to_group[p] = group + self._group_counts[group] = 0 for param_group in self.param_groups: for p in param_group['params']: @@ -177,33 +143,16 @@ def _register_hooks(self): self._grad_accs.append(grad_acc) def _allreduce_grad_async(self, p): - if p.grad is None: - # Gradient was not computed, but we still need to submit a tensor to allreduce - # as one of the other ranks may have computed it (due to dynamic forward functions). - # - # NOTE: this will not work if the gradient is sparse and we perform an allgather. 
- # Unfrotunately, there doesn't appear to be a good way to detect that the parameter will - # produce sparse gradients before computing the gradient. - p.grad = p.data.new(p.size()).zero_() - name = self._parameter_names.get(p) tensor = p.grad - - if p.grad.is_sparse: - if self.sparse_as_dense: - tensor = tensor.to_dense() - else: - return self._sparse_allreduce_grad_async(p, name) - if self._grace and self._groups is None and self.op == Average: - handle, ctx = self._grace.send_step(tensor, name,self._process_set) - postscale_factor = self.gradient_predivide_factor + if self._grace and self._num_groups == 0 and self.op == Average: + handle, ctx = self._grace.send_step(tensor, name) else: - tensor_compressed, ctx = self._compression.compress(tensor) if self.op == Average: - # Split average operation across pre/postscale factors - # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. + # Split average operation across pre/postscale factors + # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. 
prescale_factor = 1.0 / self.gradient_predivide_factor postscale_factor = self.gradient_predivide_factor else: @@ -211,24 +160,17 @@ def _allreduce_grad_async(self, p): postscale_factor = 1.0 handle = allreduce_async_(tensor_compressed, name=name, op=self.op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - process_set=self.process_set) + prescale_factor=prescale_factor, + postscale_factor=postscale_factor) return handle, ctx def _grouped_allreduce_grad_async(self, ps): name = self._parameter_names.get(ps[0]) tensors_compressed, ctxs = zip(*[self._compression.compress(p.grad) for p in ps]) - handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op, - process_set=self.process_set) + handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op) return handle, ctxs - def _sparse_allreduce_grad_async(self, p, name): - handle = sparse_allreduce_async(p.grad, name=name, op=self.op, - process_set=self.process_set) - return handle, None - def _make_hook(self, p): def hook(*ignore): if p in self._handles and self._handles[p][0] is not None: @@ -243,7 +185,7 @@ def hook(*ignore): handle, ctx = None, None self._allreduce_delay[p] -= 1 if self._allreduce_delay[p] == 0: - if self._groups is not None: + if self._num_groups > 0: group = self._p_to_group[p] self._group_counts[group] += 1 if self._group_counts[group] == len(group): @@ -260,10 +202,6 @@ def hook(*ignore): return hook def synchronize(self): - if not self.process_set.included(): - self._synchronized = True - return - completed = set() for x in self._handles.keys(): completed.update(x) if isinstance(x, tuple) else completed.add(x) @@ -276,36 +214,23 @@ def synchronize(self): if handle is None: handle, ctx = self._allreduce_grad_async(p) self._handles[p] = (handle, ctx) - for p, (handle, ctx) in self._handles.items(): + for p, (handle, ctx) in self._handles.items(): if isinstance(p, tuple): # This was a grouped result, need to unpack outputs = 
synchronize(handle) for gp, output, gctx in zip(p, outputs, ctx): self._allreduce_delay[gp] = self.backward_passes_per_step gp.grad.set_(self._compression.decompress(output, gctx)) - if self._groups is not None and self._group_counts[p] != 0: - self._group_counts[p] = 0 else: - # When handle is a callable function, it returns the aggregated tensor result - if self._grace and self._groups is None and self.op == Average: - output = self._grace.receive_step(handle, ctx,self._process_set) - p.grad.set_(output) + if self._grace and self._num_groups == 0 and self.op == Average: + # in GRACE, p is not tuple, but handle is. + output = self._grace.receive_step(handle, ctx) + self._allreduce_delay[p] = self.backward_passes_per_step + p.grad.set_(output) else: - output = synchronize(handle) if not callable(handle) else handle() + output = synchronize(handle) self._allreduce_delay[p] = self.backward_passes_per_step - if self._groups is not None: - group = self._p_to_group[p] - if self._group_counts[group] != 0: - self._group_counts[group] = 0 - if p.grad.is_sparse: - aggregated = self._compression.decompress(output, ctx) - if not aggregated.is_sparse: - # When sparse_as_dense=True we need to convert the grad back to sparse before update - aggregated = aggregated.to_sparse() - - # Sparse grads do not support set_ for some reason, so we do this as an equivalent - p.grad.zero_().add_(aggregated) p.grad.set_(self._compression.decompress(output, ctx)) self._handles.clear() @@ -523,14 +448,13 @@ def zero_grad(self): return super(self.__class__, self).zero_grad() -def DistributedOptimizer(optimizer, named_parameters=None,grace=None, +def DistributedOptimizer(optimizer, + grace=None, named_parameters=None, compression=Compression.none, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - num_groups=0, groups=None, - sparse_as_dense=False, - process_set=global_process_set): + num_groups=0): """ An optimizer that wraps another torch.optim.Optimizer, using an 
allreduce to combine gradient values before applying gradients to model weights. @@ -575,18 +499,6 @@ def DistributedOptimizer(optimizer, named_parameters=None,grace=None, gradient_predivide_factor / size after the sum. num_groups: Number of groups to assign gradient allreduce ops to for explicit grouping. Defaults to no explicit groups. - groups: The parameter to group the gradient allreduce ops. Accept values is a - non-negative integer or a list of list of torch.Tensor. - If groups is a non-negative integer, it is the number of groups to assign - gradient allreduce ops to for explicit grouping. - If groups is a list of list of torch.Tensor. Tensors in the same - inner list will be assigned to the same group, while parameter that does - not appear in any list will form a group itself. - Defaults as None, which is no explicit groups. - sparse_as_dense: If set True, convert all sparse gradients to dense and perform allreduce, then - convert back to sparse before applying the update. - process_set: Gradients will only be reduced over Horovod processes belonging - to this process set. Defaults to the global process set. """ # We dynamically create a new class that inherits from the optimizer that was passed in. # The goal is to override the `step()` method with an allreduce implementation. 
@@ -596,23 +508,12 @@ def DistributedOptimizer(optimizer, named_parameters=None,grace=None, if op != Average: raise ValueError('gradient_predivide_factor not supported with op != Average') - if num_groups != 0: - warnings.warn('Parameter `num_groups` has been replaced by `groups` ' - 'and will be removed in v0.23.0.', DeprecationWarning) - if groups is None: - groups = num_groups - - if backward_passes_per_step <= 0: - raise ValueError("backward_passes_per_step must be > 0") - if op != Adasum or size() == 1: cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedOptimizer.__dict__)) - return cls(optimizer.param_groups, named_parameters,grace, compression, backward_passes_per_step, op, - gradient_predivide_factor, groups, sparse_as_dense, process_set) + return cls(optimizer.param_groups, named_parameters, grace, compression, backward_passes_per_step, op, + gradient_predivide_factor, num_groups) else: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedAdasumOptimizer.__dict__)) return cls(optimizer.param_groups, named_parameters, compression, backward_passes_per_step) diff --git a/untitled folder/.DS_Store b/untitled folder/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..23ce012392aa198c940bb527f6221020b135256f GIT binary patch literal 6148 zcmeHKKX21O6n~cndLg3nKvBD$#K00nQqiJ{#SP&LU<3o8V8VeL9Ymtu5kVh9;K3|QOV(Cl8L$leYYfP}OJN8T@L>Q;`#0bsz`2h>%Hu3=w?C8G zR{hG=ZMsdH^rbfvGcWh^Y1Z{8&*jq-A$V{x_JgN!H0wJz9*QLQ<0P6Whd7MTR2r2g`LN>+#8_7@_Q79gGZwv`VTPW5NzZTYpPj6? 
z3;)V49WBMYtwNW+>#a9QxJZT=M=rJrDdsB2Y>lwmzJ?e$<}H&o_cG+HVtLD%t1wx~ zANl&Jnyck3e*Y1D+tFF9c<)&J%H^M}S29|2@FS`+_FG=1W!i z7?AaWVUy?@EHtXpft|VnAl6Z>1jnl8Au_2zbPX07QG-IXE24HK=88eIJL;LSa}5?6 zwL1_M8Q)_f6LUi$Dmv4lxB|UNmkj0Y2$ z5C#Jbd3ZmJ1Ch1GG!7D(>nVp*aVp(vZ8mFdx9YsPyFIV-S+lXbRp+g}o%y`toZGx| z?N0YuG>XLw`t*1Z9v*%tRsX1diLJuTnXDHE

11-z=NRRO3VX}ndjzqkVej;HNl_U-OF#OZ{UEbNN^R27RU)9gplBt?8oYM+kUCQ%d zD#ULcB?nf^`#@-8*~O?5VwMp%bsz%uZ^GC=18$3p08Tqu-B z2UhY4fLO+1Y1pQ>1m$pzuEvEz^q?`7il|bVequ0{j^kX%xf&M=RXQ;J_+WZvre7#b zu8#9_84k==J#@zzUCXgN s!9qrH3xzTTD}5cy23^G$ut-B6rv{>{aiI`L(Ci-pNrSB{1HYAluVJbE9{>OV literal 0 HcmV?d00001 diff --git a/untitled folder/grace_dl/__init__.py b/untitled folder/grace_dl/__init__.py new file mode 100644 index 0000000..ae099b3 --- /dev/null +++ b/untitled folder/grace_dl/__init__.py @@ -0,0 +1 @@ +__version__ = '1.0' diff --git a/untitled folder/grace_dl/dist/.DS_Store b/untitled folder/grace_dl/dist/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0e5427f8fe4b097ce86acb1b96e871829d3c7ce3 GIT binary patch literal 6148 zcmeHKyG{c^3>-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk literal 0 HcmV?d00001 diff --git a/untitled folder/grace_dl/dist/__init__.py b/untitled folder/grace_dl/dist/__init__.py new file mode 100644 index 0000000..dd04733 --- /dev/null +++ b/untitled folder/grace_dl/dist/__init__.py @@ -0,0 +1,51 @@ +from abc import ABC, abstractmethod + + +class Memory(ABC): + @abstractmethod + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + raise NotImplemented("compensate was not implemented.") + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + pass + + +class Compressor(ABC): + """Interface for compressing and decompressing a given tensor.""" + + def __init__(self, average=True, tensors_size_are_same=True): + self.average = average + self.tensors_size_are_same = tensors_size_are_same + + @abstractmethod + def compress(self, tensor, name): + """Compresses a tensor and returns 
it with the context needed to decompress it.""" + raise NotImplemented("compress was not implemented.") + + @abstractmethod + def decompress(self, tensors, ctx): + """Decompress the tensor with the given context.""" + raise NotImplemented("decompress was not implemented.") + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + return sum(tensors) + + +class Communicator(ABC): + @abstractmethod + def send_receive(self, tensors, name, ctx): + raise NotImplemented("send was not implemented.") + + def __init__(self, compressor, memory, world_size): + self.compressor = compressor + self.memory = memory + self.world_size = world_size + + def step(self, tensor, name): + tensor = self.memory.compensate(tensor, name) + tensors_compressed, ctx = self.compressor.compress(tensor, name) + self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) + return self.send_receive(tensors_compressed, name, ctx) diff --git a/untitled folder/grace_dl/dist/communicator/all_to_all.py b/untitled folder/grace_dl/dist/communicator/all_to_all.py new file mode 100644 index 0000000..c40268a --- /dev/null +++ b/untitled folder/grace_dl/dist/communicator/all_to_all.py @@ -0,0 +1,124 @@ +import torch, math +from torch import distributed as dist + +from grace_dl.dist import Communicator + +from grace_dl.dist.compressor.qsgd import QSGDCompressor +from grace_dl.dist.compressor.qsgd import QSGDCompressor_CUDA +from grace_dl.dist.compressor.terngrad import TernGradCompressor +from grace_dl.dist.compressor.natural import NaturalCompressor +from grace_dl.dist.compressor.natural import NaturalCompressor_CUDA + + +class AllToAll(Communicator): + + def compress_allgather(self, tensors, ctx): + tensors_gathered = [] + for tensor_compressed in tensors: + tensor_list = [torch.empty_like(tensor_compressed) for _ in range(self.world_size)] + dist.all_gather(tensor_list, tensor_compressed) + tensors_gathered.append(tensor_list) + + decompressed_list = [] + for tensors_compressed 
in zip(*tensors_gathered): + tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) + decompressed_list.append(tensor_decompressed) + tensors_concat = torch.cat(decompressed_list, dim=0) + return tensors_concat + + def send_receive(self, tensors, name, ctx): + + if isinstance(self.compressor, QSGDCompressor) or isinstance(self.compressor, QSGDCompressor_CUDA): + tensor, norm = tensors + tensor_size = tensor.numel() + world_size = self.world_size + bucket_size = self.compressor.bucket_size + + pad_size = math.ceil(tensor.numel() / (world_size * bucket_size)) * ( + world_size * bucket_size) - tensor.numel() + padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + + pad_size = tensor.numel() / bucket_size - norm.numel() + padding = torch.empty(int(pad_size), dtype=norm.dtype, layout=norm.layout, device=norm.device) + norm = torch.cat((norm, padding), dim=0) + + data = tensor + input = list(data.chunk(world_size)) + output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) + dist.all_to_all(output, input) + tensor_chunk_list = output + + data = norm + input = list(data.chunk(world_size)) + output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) + dist.all_to_all(output, input) + norm_chunk_list = output + + compressed_list = [tensor_chunk_list, norm_chunk_list] + decompressed_list = [] + for tensors_compressed in zip(*compressed_list): + chunk_shape = tensors_compressed[0].size() + tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) + decompressed_list.append(tensor_decompressed) + tensors_aggregated = self.compressor.aggregate(decompressed_list) + + elif isinstance(self.compressor, TernGradCompressor): + tensor, scalar = tensors + tensor_size = tensor.numel() + world_size = self.world_size + + pad_size = math.ceil(tensor.numel() / world_size) * 
world_size - tensor.numel() + padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + + data = tensor + input = list(data.chunk(world_size)) + output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) + dist.all_to_all(output, input) + tensor_chunk_list = output + + input = [scalar for _ in range(world_size)] + output = [torch.empty_like(scalar, device=scalar.device) for _ in range(world_size)] + dist.all_to_all(output, input) + scalar_chunk_list = output + + compressed_list = [tensor_chunk_list, scalar_chunk_list] + decompressed_list = [] + for tensors_compressed in zip(*compressed_list): + chunk_shape = tensors_compressed[0].size() + tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) + decompressed_list.append(tensor_decompressed) + tensors_aggregated = self.compressor.aggregate(decompressed_list) + + elif isinstance(self.compressor, NaturalCompressor) or isinstance(self.compressor, NaturalCompressor_CUDA): + tensor = tensors[0] + tensor_size = tensor.numel() + world_size = self.world_size + + pad_size = math.ceil(tensor.numel() / world_size) * world_size - tensor.numel() + padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + + data = tensor + input = list(data.chunk(world_size)) + output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) + dist.all_to_all(output, input) + tensor_chunk_list = output + + compressed_list = [[t, ] for t in tensor_chunk_list] + decompressed_list = [] + for tensors_compressed in compressed_list: + chunk_shape = tensors_compressed[0].size() + tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) + decompressed_list.append(tensor_decompressed) + tensors_aggregated = self.compressor.aggregate(decompressed_list) 
+ + else: + raise NotImplementedError(self.compressor) + + tensors_compressed, ctx2 = self.compressor.compress(tensors_aggregated, name) + tensors_concat = self.compress_allgather(tensors_compressed, ctx2) + tensors_aggregated = tensors_concat[:tensor_size].view(ctx) + + return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/dist/communicator/allgather.py b/untitled folder/grace_dl/dist/communicator/allgather.py new file mode 100644 index 0000000..a86f911 --- /dev/null +++ b/untitled folder/grace_dl/dist/communicator/allgather.py @@ -0,0 +1,45 @@ +import torch +from torch import distributed as dist + +from grace_dl.dist import Communicator + + +class Allgather(Communicator): + def send_receive(self, tensors, name, ctx): + if self.compressor.tensors_size_are_same: + tensors_gathered = [] + for tensor_compressed in tensors: + tensor_list = [torch.empty_like(tensor_compressed) for _ in range(self.world_size)] + dist.all_gather(tensor_list, tensor_compressed) + tensors_gathered.append(tensor_list) + else: + local_sizes = torch.tensor([t.numel() for t in tensors]).cuda(tensors[0].device.index) + gathered_sizes = [torch.empty_like(local_sizes) for _ in range(self.world_size)] + dist.all_gather(gathered_sizes, local_sizes) # tensor of tensor sizes per rank + + tensors_gathered = [] + for tensor, sizes in zip(tensors, zip(*gathered_sizes)): + local_size = tensor.numel() + max_size = max(sizes) + gathered = [] + for _ in range(self.world_size): + padded = torch.empty(max_size, dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) + gathered.append(padded) + if local_size != max_size: + padding = torch.empty(max_size - local_size, dtype=tensor.dtype, layout=tensor.layout, + device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(gathered, tensor) + + data_list = [] + for size, tensor_gathered in zip(sizes, gathered): + 
data_list.append(tensor_gathered[:size]) + + tensors_gathered.append(data_list) + + decompressed_list = [] + for tensors_compressed in zip(*tensors_gathered): + tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) + decompressed_list.append(tensor_decompressed) + tensors_aggregated = self.compressor.aggregate(decompressed_list) + return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/dist/communicator/allreduce.py b/untitled folder/grace_dl/dist/communicator/allreduce.py new file mode 100644 index 0000000..4d8a3d2 --- /dev/null +++ b/untitled folder/grace_dl/dist/communicator/allreduce.py @@ -0,0 +1,13 @@ +from torch import distributed as dist + +from grace_dl.dist import Communicator + + +class Allreduce(Communicator): + + def send_receive(self, tensors, name, ctx): + for tensor_compressed in tensors: + dist.all_reduce(tensor_compressed) + if self.compressor.average: + tensor_compressed.div_(self.world_size) + return self.compressor.decompress(tensors, ctx) diff --git a/untitled folder/grace_dl/dist/communicator/broadcast.py b/untitled folder/grace_dl/dist/communicator/broadcast.py new file mode 100644 index 0000000..a272752 --- /dev/null +++ b/untitled folder/grace_dl/dist/communicator/broadcast.py @@ -0,0 +1,33 @@ +import torch +from torch import distributed as dist + +from grace_dl.dist import Communicator + + +class Broadcast(Communicator): + + def __init__(self, compressor, memory, world_size, rank): + super().__init__(compressor, memory, world_size) + self.rank = rank + + def send_receive(self, tensors, name, ctx): + if not self.compressor.tensors_size_are_same: + raise NotImplemented() + + tensors_decompressed = [] + for root_rank in range(self.world_size): + if root_rank == self.rank: + broadcasted = tensors + for tensor_compressed in tensors: + dist.broadcast(tensor_compressed, root_rank) + else: + broadcasted = [] + for tensor_compressed in tensors: + 
tensor = torch.empty_like(tensor_compressed) + dist.broadcast(tensor, root_rank) + broadcasted.append(tensor) + tensor_decompressed = self.compressor.decompress(broadcasted, ctx) + tensors_decompressed.append(tensor_decompressed) + + tensor_aggregated = self.compressor.aggregate(tensors_decompressed) + return (tensor_aggregated / self.world_size) if self.compressor.average else tensor_aggregated diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp new file mode 100644 index 0000000..10b29ea --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp @@ -0,0 +1,29 @@ +#include + +#include + +// CUDA forward declarations + +torch::Tensor cnat_compress_cuda(torch::Tensor input); +torch::Tensor cnat_compress_deterministic_cuda(torch::Tensor input); +torch::Tensor cnat_decompress_cuda(torch::Tensor input); + +// C++ interface + +torch::Tensor cnat_compress(torch::Tensor input) { + return cnat_compress_cuda(input); +} + +torch::Tensor cnat_compress_deterministic(torch::Tensor input) { + return cnat_compress_deterministic_cuda(input); +} + +torch::Tensor cnat_decompress(torch::Tensor input) { + return cnat_decompress_cuda(input); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("compress", &cnat_compress, "CNat compress (CUDA)"); + m.def("compress_deterministic", &cnat_compress_deterministic, "CNat compress deterministic (CUDA)"); + m.def("decompress", &cnat_decompress, "CNat decompress (CUDA)"); +} diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu new file mode 100644 index 0000000..88cc6c3 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu @@ -0,0 +1,184 @@ +#include + +#include +#include +#include + +namespace { +__constant__ __device__ uint8_t sign_and_exp_to_encoding[512] = +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, +109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, +123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, +127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, +128, 128, 128, 128, 128, 128, 128, 128, 129, 130, 131, 132, 133, 134, +135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, +149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, +163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, +177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, +191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, +205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, +219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, +233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, +247, 248, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255}; + +__constant__ __device__ uint32_t encoding_to_sign_and_exp[256] = +{ 0, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, +101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, +115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, +129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, +143, 144, 256, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, +285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, +299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, +313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, +327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, +341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, +355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, +369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, +383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, +397, 398, 399, 400}; + +__global__ void cnat_compress_cuda_kernel( + float* __restrict__ input, + uint8_t* __restrict__ output, + const float* __restrict__ rand, + int64_t len) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < len) { + if (input[index] == 0) + output[index] = 0; + else { + int exp; + 
float prob = abs(frexpf(input[index], &exp)) / 0.5 - 1.; // [0.5, 1) -> [0, 1) + if (rand[index] >= prob) exp -= 1; + + if (input[index] < 0) exp += 383; // 256+127 + else exp += 127; + output[index] = sign_and_exp_to_encoding[exp]; + + //exp += 127; + //uint8_t encode; + //if (exp<=17) encode = 0; + //else if (exp<=143) encode = uint8_t(exp-17); + //else encode = 127; + //if (input[index] < 0) encode += 256; + //output[index] = encode; + } + } +} + +__global__ void cnat_compress_deterministic_cuda_kernel( + float* __restrict__ input, + uint8_t* __restrict__ output, + int64_t len) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < len) { + if (input[index] == 0) + output[index] = 0; + else { + int exp; + float prob = abs(frexpf(input[index], &exp)) / 0.5 - 1.; // [0.5, 1) -> [0, 1) + if (0.5 >= prob) exp -= 1; + + if (input[index] < 0) exp += 383; // 256+127 + else exp += 127; + output[index] = sign_and_exp_to_encoding[exp]; + + //exp += 127; + //uint8_t encode; + //if (exp<=17) encode = 0; + //else if (exp<=143) encode = uint8_t(exp-17); + //else encode = 127; + //if (input[index] < 0) encode += 256; + //output[index] = encode; + } + } +} + +__global__ void cnat_decompress_cuda_kernel( + uint8_t* __restrict__ input, + float* __restrict__ output, + int64_t len) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < len) { + uint32_t sign_and_exp = encoding_to_sign_and_exp[input[index]] << 23; + output[index] = reinterpret_cast(sign_and_exp); + } +} + +} // namespace + +torch::Tensor cnat_compress_cuda(torch::Tensor input) { + auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kUInt8).device(input.device())); + auto rand = torch::rand_like(input, torch::TensorOptions().device(input.device())); // [0, 1) + const int threads = 1024; + int64_t numel = input.numel(); + auto blocks = numel/threads; + if (numel%threads || !blocks) blocks++; + + cnat_compress_cuda_kernel<<>>( + input.data_ptr(), + 
output.data_ptr(), + rand.data_ptr(), + numel); + + return output; +} + +torch::Tensor cnat_compress_deterministic_cuda(torch::Tensor input) { + auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kUInt8).device(input.device())); + const int threads = 1024; + int64_t numel = input.numel(); + auto blocks = numel/threads; + if (numel%threads || !blocks) blocks++; + + cnat_compress_deterministic_cuda_kernel<<>>( + input.data_ptr(), + output.data_ptr(), + numel); + + return output; +} + +torch::Tensor cnat_decompress_cuda(torch::Tensor input) { + auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); + const int threads = 1024; + int64_t numel = input.numel(); + auto blocks = numel/threads; + if (numel%threads || !blocks) blocks++; + + cnat_decompress_cuda_kernel<<>>( + input.data_ptr(), + output.data_ptr(), + numel); + + return output; +} + diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py new file mode 100644 index 0000000..9b564c5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='cnat_cuda', + ext_modules=[ + CUDAExtension('cnat_cuda', [ + 'cnat.cpp', + 'cnat_cuda.cu', + ]), + ], + cmdclass={ + 'build_ext': BuildExtension + }) diff --git a/untitled folder/grace_dl/dist/compressor/dgc.py b/untitled folder/grace_dl/dist/compressor/dgc.py new file mode 100644 index 0000000..31224d5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/dgc.py @@ -0,0 +1,50 @@ +import torch + +from grace_dl.dist import Compressor + + +class DgcCompressor(Compressor): + + def __init__(self, compress_ratio): + super().__init__(tensors_size_are_same=False) + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + shape = tensor.size() + 
tensor = tensor.flatten() + numel = tensor.numel() + + sample_shape = [max(1, int(numel * 0.01))] + sample_index = torch.empty(sample_shape).uniform_(0, numel).type(torch.long) + sample_tensor = tensor[sample_index] + + k = max(1, int(numel * self.compress_ratio * 0.01)) + vals, indices = torch.topk(sample_tensor.abs(), k) + + thr = vals.min() + mask = tensor.abs() >= thr + selected = mask.sum() + + for _ in range(10): + if selected > 1.3 * numel * self.compress_ratio: + thr = 1.3 * thr + elif selected < 0.7 * numel * self.compress_ratio: + thr = 0.7 * thr + else: + break + mask = tensor.abs() >= thr + selected = mask.sum() + + indices, = torch.where(mask) + values = tensor[indices] + + tensor_compressed = values, indices + ctx = shape, mask, numel + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + values, indices = tensor_compressed + shape, _, numel = ctx + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/efsignsgd.py b/untitled folder/grace_dl/dist/compressor/efsignsgd.py new file mode 100644 index 0000000..443bf06 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/efsignsgd.py @@ -0,0 +1,33 @@ +import torch + +from grace_dl.dist import Compressor + + +class EFSignSGDCompressor(Compressor): + + def __init__(self, lr): + super().__init__(average=False) + self.learning_rate = lr + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + shape = tensor.size() + tensor = tensor.flatten() + + sign_encode = tensor >= 0 + mean = tensor.abs().mean() + tensor_compressed = mean, sign_encode.type(torch.uint8) + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + """Decoding the signs to float format """ + mean, sign_encode = tensor_compressed + sign_decode = 
sign_encode.type(torch.float32) * 2 - 1 + sign_decode = mean * sign_decode + tensor_decompressed = sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + return sum(tensors) / self.learning_rate diff --git a/untitled folder/grace_dl/dist/compressor/fp16.py b/untitled folder/grace_dl/dist/compressor/fp16.py new file mode 100644 index 0000000..1bdd20c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/fp16.py @@ -0,0 +1,22 @@ +import torch + +from grace_dl.dist import Compressor + + +class FP16Compressor(Compressor): + """Compress all floating point gradients to 16-bit.""" + + def compress(self, tensor, name): + """Downcasts the tensor to 16-bit.""" + dtype = tensor.dtype + if dtype.is_floating_point: + # Only allow compression from other floating point types + tensor = tensor.type(torch.float16) + return [tensor], dtype + + def decompress(self, tensors, dtype): + """Upcasts the tensor to the initialization dtype.""" + tensor_decompressed, = tensors + if dtype.is_floating_point: + tensor_decompressed = tensor_decompressed.type(dtype) + return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/natural.py b/untitled folder/grace_dl/dist/compressor/natural.py new file mode 100644 index 0000000..0dd29ec --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/natural.py @@ -0,0 +1,57 @@ +# conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack + +from grace_dl.torch import Compressor + + +class NaturalCompressor(Compressor): + def __init__(self): + super().__init__() + + def compress(self, tensor, name): + import cupy + shape = tensor.size() + tensor_flatten = tensor.flatten() + cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten)) + tensor_cast = cupy_tensor.view(cupy.int32) + sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000) + exp = tensor_cast & 
cupy.int32(0b01111111100000000000000000000000) + mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111) + exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111, + size=cupy_tensor.shape, + dtype=cupy.int32) + exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) + exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000) + exps = cupy.right_shift(exp_shift, 23) + exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18) + tensor_compressed = exps.astype(cupy.uint8) + return [from_dlpack(tensor_compressed.toDlpack())], shape + + def decompress(self, tensor_compressed, shape): + import cupy + tensor_compressed, = tensor_compressed + cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_compressed)) + sign = cupy_tensor > 127 + exps = cupy.bitwise_and(cupy_tensor, 0b01111111) + floats = cupy.left_shift((exps + 18).astype(cupy.int32), 23).view(cupy.float32) + tensor_decompressed = cupy.where(sign, -floats, floats) + tensor_decompressed = cupy.multiply((exps >= 1).astype(cupy.float32), tensor_decompressed) + return from_dlpack(tensor_decompressed.toDlpack()).view(shape) + + +class NaturalCompressor_CUDA(Compressor): + def __init__(self): + super().__init__() + + def compress(self, tensor, name): + import cnat_cuda + shape = tensor.size() + tensor_compressed = cnat_cuda.compress(tensor.flatten()) + return [tensor_compressed, ], shape + + def decompress(self, tensor_compressed, shape): + import cnat_cuda + tensor_compressed, = tensor_compressed + tensor_decompressed = cnat_cuda.decompress(tensor_compressed) + return tensor_decompressed.view(shape) \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/none.py b/untitled folder/grace_dl/dist/compressor/none.py new file mode 100644 index 0000000..02d7bb0 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/none.py @@ -0,0 +1,12 @@ +from grace_dl.dist 
import Compressor + + +class NoneCompressor(Compressor): + """Default no-op compression.""" + + def compress(self, tensor, name): + return [tensor], None + + def decompress(self, tensors, ctx): + tensor, = tensors + return tensor diff --git a/untitled folder/grace_dl/dist/compressor/onebit.py b/untitled folder/grace_dl/dist/compressor/onebit.py new file mode 100644 index 0000000..5558b18 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/onebit.py @@ -0,0 +1,31 @@ +import torch + +from grace_dl.dist import Compressor + + +class OneBitCompressor(Compressor): + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + numel = tensor.numel() + + mask0 = tensor < 0 + sum0 = torch.sum(tensor[mask0]) + num0 = torch.sum(mask0).float() + mean0 = sum0 / num0 if num0 > 0 else sum0 + + mask1 = ~mask0 + sum1 = torch.sum(tensor[mask1]) + num1 = numel - num0 + mean1 = sum1 / num1 if num1 > 0 else sum1 + + tensor_compressed = mask0.type(torch.uint8), mean0, mean1 + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + mask0, mean0, mean1 = tensor_compressed + tensor_decompressed = mask0 * mean0 + ~mask0 * mean1 + tensor_decompressed = tensor_decompressed.view(shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/powersgd.py b/untitled folder/grace_dl/dist/compressor/powersgd.py new file mode 100644 index 0000000..3912195 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/powersgd.py @@ -0,0 +1,65 @@ +import torch +from torch import distributed as dist + +from grace_dl.dist import Compressor + + +@torch.jit.script +def orthogonalize(matrix): + n, m = matrix.shape + for i in range(m): + # Normalize the i'th column + col = matrix[:, i: i + 1] + col /= torch.sqrt(torch.sum(col ** 2)) + # Project it on the rest and remove it + if i + 1 < m: + rest = matrix[:, i + 1:] + # rest -= torch.matmul(col.t(), rest) * col + rest -= torch.sum(col * rest, dim=0) * col + + 
+class PowerSGDCompressor(Compressor): + + def __init__(self, rank=1, use_memory=False, world_size=1): + super().__init__() + self.world_size = world_size + self.q_memory = {} + self.rank = rank + self.use_memory = use_memory + + def compress(self, tensor, name): + if tensor.dim() == 1: + return [tensor], None + + shape = tensor.size() + matrix = tensor.view([shape[0], -1]) + n, m = matrix.size() + r = min(n, m, self.rank) + if self.use_memory and name in self.q_memory: + q = self.q_memory[name] + else: + q = torch.empty(m, r, dtype=matrix.dtype, layout=matrix.layout, device=matrix.device).normal_() + # q, _ = torch.qr(q) + orthogonalize(q) + + p = torch.mm(matrix, q) + dist.all_reduce(p) + p = p / self.world_size + # p, _ = torch.qr(p) + orthogonalize(p) + q = torch.mm(matrix.t(), p) + dist.all_reduce(q) + q = q / self.world_size + ctx = p, q, shape + if self.use_memory: + self.q_memory[name] = q + return [], ctx + + def decompress(self, tensors, ctx): + if ctx is None: + tensor, = tensors + return tensor + p, q, tensor_shape = ctx + new_tensor = torch.mm(p, q.t()) + tensor_decompressed = new_tensor.view(tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/qsgd.py b/untitled folder/grace_dl/dist/compressor/qsgd.py new file mode 100644 index 0000000..7922bf8 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/qsgd.py @@ -0,0 +1,79 @@ +import torch +from grace_dl.dist import Compressor + + +class QSGDCompressor(Compressor): + + def __init__(self, quantum_num, bucket_size=128): + super().__init__() + self.quantum_num = quantum_num + self.bucket_size = bucket_size + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + abs_gradient = tensor.abs() + + if tensor.numel() % self.bucket_size != 0: + pad_size = self.bucket_size - tensor.numel() % self.bucket_size + pad_tensor = torch.cat([tensor, torch.zeros(pad_size, dtype=tensor.dtype, device=tensor.device)]) + else: + pad_tensor = 
tensor + pad_tensor = pad_tensor.view([-1, self.bucket_size]) + pad_tensor_sqsum = torch.sum(pad_tensor ** 2, dim=1) + bucket_norm = torch.sqrt(pad_tensor_sqsum) + b = torch.ones([1, self.bucket_size], device=tensor.device) + expand_norm = torch.matmul(bucket_norm.view([-1, 1]), b) + norm = expand_norm.flatten()[:tensor.numel()] + + level_float = self.quantum_num / norm * abs_gradient + previous_level = level_float.floor() + prob = torch.empty_like(tensor).uniform_() + is_next_level = (prob < (level_float - previous_level)).type(torch.float32) + new_level = (previous_level + is_next_level) + + sign = tensor.sign() + tensor_compressed = (new_level * sign).type(torch.int16) + tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) + + return (tensor_compressed, bucket_norm), shape + + def decompress(self, tensor_compressed, ctx): + tensor_compressed, bucket_norm = tensor_compressed + shape = ctx + b = torch.ones([1, self.bucket_size], device=tensor_compressed.device) + expand_norm = torch.matmul(bucket_norm.view([-1, 1]), b) + norm = expand_norm.flatten()[:shape.numel()] + decode_output = tensor_compressed.type(torch.float32) + tensor_decompressed = norm / self.quantum_num * decode_output + tensor_decompressed = tensor_decompressed.view(shape) + + return tensor_decompressed + + +class QSGDCompressor_CUDA(Compressor): + + def __init__(self, quantum_num, bucket_size=128): + super().__init__() + self.quantum_num = quantum_num + self.bucket_size = bucket_size + + def compress(self, tensor, name): + import qsgd_cuda + shape = tensor.size() + tensor = tensor.flatten() + + tensor_compressed, bucket_norm = qsgd_cuda.compress(tensor, self.quantum_num, self.bucket_size) + tensor_compressed = tensor_compressed, bucket_norm.float() + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + import qsgd_cuda + tensor_compressed, bucket_norm = tensor_compressed + + tensor_decompressed = 
qsgd_cuda.decompress(tensor_compressed, bucket_norm.double(), self.quantum_num, self.bucket_size) + + return tensor_decompressed.view(shape) + + diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py new file mode 100644 index 0000000..6814041 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py @@ -0,0 +1,16 @@ +# Compilation cmd: python setup.py install +# Alternative JIT compilation, refer to https://pytorch.org/tutorials/advanced/cpp_extension.html#jit-compiling-extensions +# from torch.utils.cpp_extension import load +# qsgd_cuda = load(name="qsgd_cuda", sources=["qsgd.cpp", "qsgd_cuda.cu"], verbose=True) + +import torch +import qsgd_cuda + +t = torch.rand(200000, dtype=torch.float32, device='cuda') +print(t) + +out, scaler = qsgd_cuda.compress(t, 127, 128) +print(out, scaler) + +out2 = qsgd_cuda.decompress(out, scaler, 127, 128) +print(out2) \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp new file mode 100644 index 0000000..6f11f5e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp @@ -0,0 +1,24 @@ +#include +#include + +// CUDA forward declarations + +std::vector qsgd_compress_cuda(torch::Tensor input, int level, int bucket_size); + +torch::Tensor qsgd_decompress_cuda(torch::Tensor input,torch::Tensor scaler, int level, int bucket_size); + + +// C++ interface +std::vector qsgd_compress(torch::Tensor input, int level, int bucket_size) { + return qsgd_compress_cuda(input, level, bucket_size); +} + +torch::Tensor qsgd_decompress(torch::Tensor input,torch::Tensor scaler, int level, int bucket_size) { + return qsgd_decompress_cuda(input, scaler, level, bucket_size); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("compress", &qsgd_compress, "QSGD compression"); + m.def("decompress", &qsgd_decompress, 
"QSGD decompression"); +} diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu new file mode 100644 index 0000000..16cac6c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu @@ -0,0 +1,538 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace std; + +#define EIGEN_USE_GPU +#define maxThreadsPerBlock 1024 + +__global__ void _qsgdreduceSumV2(float *g_odata, float *g_idata, unsigned int n) +{ + extern __shared__ float sdata[]; + + // set thread ID + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + unsigned int i = blockIdx.x * blockDim.x + tid; + unsigned int blockSize = blockDim.x; + + sdata[tid] = 0; + + while (i < n) { + sdata[tid] += g_idata[i];// + g_idata[i + blockDim.x]; + i += gridSize; + } + __syncthreads(); + + // in-place reduction and complete unroll + if (blockSize >= 1024) { + if (tid < 512) sdata[tid] += sdata[tid + 512]; + __syncthreads(); + } + if (blockSize >= 512) { + if (tid < 256) sdata[tid] += sdata[tid + 256]; + __syncthreads(); + } + if (blockSize >= 256) { + if (tid < 128) sdata[tid] += sdata[tid + 128]; + __syncthreads(); + } + if (blockSize >= 128) { + if (tid < 64) sdata[tid] += sdata[tid + 64]; + __syncthreads(); + } + + // unrolling warp + if (tid < 32) + { + volatile float *vsmem = sdata; + vsmem[tid] += vsmem[tid + 32]; + vsmem[tid] += vsmem[tid + 16]; + vsmem[tid] += vsmem[tid + 8]; + vsmem[tid] += vsmem[tid + 4]; + vsmem[tid] += vsmem[tid + 2]; + vsmem[tid] += vsmem[tid + 1]; + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x] = sdata[0]; + } +} + +__global__ void _qsgdreduceClipThresholdV2(float *g_odata, float *g_idata, unsigned int n) +{ + extern __shared__ float sdata[]; + + // set thread ID + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + unsigned 
int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int blockSize = blockDim.x; + + sdata[tid] = 0; + + while (i < n) { + if (isfinite(g_idata[i])) { + sdata[tid] += g_idata[i] * g_idata[i];// + g_idata[i + blockDim.x] * g_idata[i + blockDim.x]; + } + i += gridSize; + } + __syncthreads(); + + // in-place reduction and complete unroll + if (blockSize >= 1024) { + if (tid < 512) sdata[tid] += sdata[tid + 512]; + __syncthreads(); + } + if (blockSize >= 512) { + if (tid < 256) sdata[tid] += sdata[tid + 256]; + __syncthreads(); + } + if (blockSize >= 256) { + if (tid < 128) sdata[tid] += sdata[tid + 128]; + __syncthreads(); + } + if (blockSize >= 128) { + if (tid < 64) sdata[tid] += sdata[tid + 64]; + __syncthreads(); + } + + // unrolling warp + if (tid < 32) + { + volatile float *vsmem = sdata; + vsmem[tid] += vsmem[tid + 32]; + vsmem[tid] += vsmem[tid + 16]; + vsmem[tid] += vsmem[tid + 8]; + vsmem[tid] += vsmem[tid + 4]; + vsmem[tid] += vsmem[tid + 2]; + vsmem[tid] += vsmem[tid + 1]; + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x] = sdata[0]; + } +} + +__global__ void _qsgdreduceAbsMaxV2(float *g_odata, float *g_idata, unsigned int n) +{ + extern __shared__ float sdata[]; + + // set thread ID + unsigned int tid = threadIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int blockSize = blockDim.x; + + sdata[tid] = 0; + + while (i < n) { + if (isfinite(g_idata[i]) && isfinite(sdata[tid])) + sdata[tid] = fmaxf(sdata[tid], fabsf(g_idata[i])); //fmaxf(fabsf(g_idata[i]), fabsf(g_idata[i + blockDim.x]))); + else + sdata[tid] = nanf("123"); + i += gridSize; + } + __syncthreads(); + + // in-place reduction and complete unroll + if (blockSize >= 1024) { + if (tid < 512) { + if (isfinite(sdata[tid]) && isfinite(sdata[tid + 512])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 512]); + else sdata[tid] = nanf("123"); + } + __syncthreads(); + } + if (blockSize 
>= 512) { + if (tid < 256) { + if (isfinite(sdata[tid]) && isfinite(sdata[tid + 256])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 256]); + else sdata[tid] = nanf("123"); + } + __syncthreads(); + } + if (blockSize >= 256) { + if (tid < 128) { + if (isfinite(sdata[tid]) && isfinite(sdata[tid + 128])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 128]); + else sdata[tid] = nanf("123"); + } + __syncthreads(); + } + if (blockSize >= 128) { + if (tid < 64) { + if (isfinite(sdata[tid]) && isfinite(sdata[tid + 64])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 64]); + else sdata[tid] = nanf("123"); + } + __syncthreads(); + } + + // unrolling warp + if (tid < 32) + { + volatile float *vsmem = sdata; + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 32])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 32]); + else vsmem[tid] = nanf("123"); + + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 16])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 16]); + else vsmem[tid] = nanf("123"); + + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 8])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 8]); + else vsmem[tid] = nanf("123"); + + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 4])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 4]); + else vsmem[tid] = nanf("123"); + + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 2])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 2]); + else vsmem[tid] = nanf("123"); + + if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 1])) + vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 1]); + else vsmem[tid] = nanf("123"); + + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x] = sdata[0]; + } +} + +__global__ void _qsgdcomputeSqrt(float *scaler) +{ + *scaler = sqrt(*scaler); + //printf("l2 norm result: %f\n", *scaler); + //__syncthreads(); +} + +__global__ void _qsgdinitCURand(unsigned int len, unsigned int seed, curandState* states) +{ + unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; + /* we have to initialize the 
state */ + if (index < len) + curand_init(seed + index, /* the seed can be the same for each core, here we pass the time in from the CPU */ + 0, /* the sequence number should be different for each core (unless you want all + cores to get the same sequence of numbers for some reason - use thread id! */ + 0, /* the offset is how much extra we advance in the sequence for each call, can be 0 */ + &states[index]); +} + +__global__ void _qsgdcompensateMemory(float *dst, const float *src, const float *local_mem, int len) +{ + unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int stride = gridDim.x * blockDim.x; + + for (int i = index; i < len; i += stride){ + if (isfinite(src[i])) { + //dst[i] = src[i]; // + local_mem[i]; //remove memory compensation for comparison purposes. + dst[i] = src[i] + local_mem[i]; + } + else { + dst[i] = nanf("123"); + } + //printf("CompensateMemory result: idx=%d, src=%f, mem=%f, dst=%f\n", i, src[i], local_mem[i], dst[i]); + //__syncthreads(); + } +} + +__global__ void _qsgdTernarizeValue(int8_t *dst, const float *src, float *scaler, float *local_mem, const int len, int level, curandState* states) +{ + unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int stride = gridDim.x * blockDim.x; + curandState local_state = states[index]; + float norm_scaler = *scaler; + + // The input tensor here has been clipped. 
+ // Hence we have the ternarize formula: dst[i] = new_level[i] * sign(src[i]) + for (int i = index; i < len; i += stride) { + if (isfinite(norm_scaler) && isfinite(src[i])) { + float rand_sample = curand_uniform(&local_state); + float level_float = (float)level / norm_scaler * fabsf(src[i]); + int8_t previous_level = floor(level_float); + if (rand_sample < level_float - previous_level) { + dst[i] = previous_level + 1; // 1 is required by qsgd + } + else { + dst[i] = previous_level; + } + if (src[i] < 0){ + dst[i] = -dst[i]; + } + + // update local memory + local_mem[i] = src[i] - norm_scaler / (float)level * (float)dst[i]; // remove vanilla local memory update for comparison purposes. + } + else { + // encode value to the minimum for Inf or NaN + dst[i] = -128; + } + //printf("compressed result: idx=%d, scaler=%f, src=%f, dst=%d, update_mem=%f\n", i, *scaler, src[i], dst[i], local_mem[i]); + //__syncthreads(); + } +} + + +// For qsgd allreduce +// __global__ void _qsgdDeternarizeValue(int len, float *dst, int8_t *src, float *scaler, int level) +// { +// int index = blockIdx.x * blockDim.x + threadIdx.x; +// int stride = blockDim.x * gridDim.x; +// float norm_scaler = *scaler; + +// for (int i = index; i < len; i += stride) +// { +// dst[i] = norm_scaler / (float)level * (float)src[i]; +// } +// } + + +// For qsgd allgather +__global__ void _qsgdDeternarizeAndAdd(int len, float *dst, int8_t *src, float *scaler, int level) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float norm_scaler = *scaler; + + for (int i = index; i < len; i += stride) { + if (src[i] == -128) { + dst[i] = nanf("123"); + } + else { + dst[i] += norm_scaler / (float)level * (float)src[i]; + } + //printf("decompressed result: idx=%d, scaler=%f, src=%d, dst=%f\n", i, *scaler, src[i], dst[i]); + //__syncthreads(); + } +} + +__global__ void _bucket_l2norm(const int len, double *dst, float *src, const int bucket_size) +{ + int index = blockIdx.x * 
blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + const int loop_times = len / bucket_size; + const int remain_nums = len % bucket_size; + + for (int i = index; i < loop_times; i += stride) + { +#pragma unroll + for (int j = 0; j < bucket_size; j ++){ + if (isfinite(src[bucket_size*i+j])) { + dst[i] += (double)(src[bucket_size*i+j]) * (double)(src[bucket_size*i+j]); + } + } + dst[i] = sqrt(dst[i]); + } + if (remain_nums && index == loop_times){ +#pragma unroll + for (int i = 0; i < remain_nums; i++){ + if (isfinite(src[bucket_size*loop_times+i])) { + dst[loop_times] += (double)(src[bucket_size*loop_times+i]) * (double)(src[bucket_size*loop_times+i]); + } + } + dst[loop_times] = sqrt(dst[loop_times]); + } + +} + + + +__global__ void _bucket_qsgdTernarizeValue(int8_t *dst, const float *src, double *scaler, const int len, int level, const int bucket_size, unsigned int seed) +{ + unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int stride = gridDim.x * blockDim.x; + // curandState local_state = states[index]; + curandState local_state; + + + // The input tensor here has been clipped. + // Hence we have the ternarize formula: dst[i] = new_level[i] * sign(src[i]) + for (int i = index; i < len; i += stride) { + float norm_scaler = (float)(scaler[i/bucket_size]); + curand_init(seed + index, 0, 0, &local_state); + if (isfinite(norm_scaler) && isfinite(src[i])) { + float rand_sample = curand_uniform(&local_state); + float level_float = (float)level / norm_scaler * fabsf(src[i]); + int8_t previous_level = floor(level_float); + if (rand_sample < level_float - previous_level) { + dst[i] = previous_level + 1; // 1 is required by qsgd + } + else { + dst[i] = previous_level; + } + if (src[i] < 0){ + dst[i] = -dst[i]; + } + + // update local memory + //local_mem[i] = src[i] - norm_scaler / (float)level * (float)dst[i]; // remove vanilla local memory update for comparison purposes. 
+ } + else { + // encode value to the minimum for Inf or NaN + dst[i] = -128; + } + //printf("compressed result: idx=%d, scaler=%f, src=%f, dst=%d, update_mem=%f\n", i, *scaler, src[i], dst[i], local_mem[i]); + //__syncthreads(); + } +} + +// For qsgd allgather +__global__ void _bucket_qsgdDeternarizeAndAdd(int len, float *dst, int8_t *src, double *scaler, int level, const int bucket_size) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i = index; i < len; i += stride) { + float norm_scaler = (float)(scaler[i/bucket_size]); + if (src[i] == -128) { + dst[i] = nanf("123"); + } + else { + dst[i] = norm_scaler / (float)level * (float)src[i]; + //atomicAdd(dst+i, norm_scaler / (float)level * (float)src[i]); + } + //printf("decompressed result: idx=%d, scaler=%f, src=%d, dst=%f\n", i, *scaler, src[i], dst[i]); + //__syncthreads(); + } +} + + +/*----------------------------------- Reduce Wrapper --------------------------------------------*/ +void qsgdGPUReduce(int len, float *d_out, float *d_intermediate_res, float *result, int whichKernel, cudaStream_t stream) { + // d_intermediate_res holds the input + // setting up blocks + int numBlocks = (int) ceil(1.0 * len / maxThreadsPerBlock); //(len / maxThreadsPerBlock) + 1; + + int prevNumBlocks = len; + // recursively reduce to get the result + while (numBlocks > maxThreadsPerBlock) { + // clear d_out + cudaMemset(d_out, 0, numBlocks * sizeof(float)); + + switch (whichKernel) { + // reduce sum + case 0: + _qsgdreduceSumV2<<>>(d_out, d_intermediate_res, len); + break; + // reduce absmax + case 1: + _qsgdreduceAbsMaxV2<<>>(d_out, d_intermediate_res, len); + break; + // reduce clip threshold + case 2: + _qsgdreduceClipThresholdV2<<>>(d_out, d_intermediate_res, len); + // we don't need to square the intermediate results. 
+ whichKernel = 0; + break; + default: + break; + } + + // by now, d_out holds the intermediate result, copy it to intermedaite_res for the next run + cudaMemcpy(d_intermediate_res, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToDevice); + // compute reduced problem size + prevNumBlocks = numBlocks; + len = numBlocks; + numBlocks = (int) ceil(1.0 * numBlocks / maxThreadsPerBlock); //numBlocks / maxThreadsPerBlock + 1; + } + + // use one block to compute the rest. + // clear d_out + cudaMemset(d_out, 0, prevNumBlocks* sizeof(float)); + switch (whichKernel) { + // reduce sum + case 0: + _qsgdreduceSumV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); + break; + // reduce absmax + case 1: + _qsgdreduceAbsMaxV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); + break; + // reduce clip threshold + case 2: + _qsgdreduceClipThresholdV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); + break; + default: + break; + } + // as we just use one block, just move the first element of d_out to result + cudaMemcpy(result, d_out, sizeof(float), cudaMemcpyDeviceToDevice); +} + +/*----------------------------------- Kernel Launch Wrappers ------------------------------------*/ + + +void GPUReduceL2Norm(float *array, int len, double *l2norm_scaler, const int bucket_size) +{ + int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); + _bucket_l2norm<<>>(len, l2norm_scaler, array, bucket_size); +} + +// void qsgdGPUInit_curand(int n, unsigned int seed, curandState* cuda_states) +// { +// int blocksPerGrid = (int) ceil(1.0 * n / maxThreadsPerBlock); +// _qsgdinitCURand<<>>(n, seed, cuda_states); +// } + +// void qsgdGPUCompensateMemory(float *dst, const float *src, const float* local_mem, int len) +// { +// int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); +// _qsgdcompensateMemory<<>>(dst, 
src, local_mem, len); +// } + + + +void GPUTernarizeMultiLevelValue(int8_t *dst, const float *src, double *scaler, int len, int level, const int bucket_size) +{ + int blocksPerGrid = (int) ceil(1.0 * std::min(len, 1024 * 1024 * 25) / maxThreadsPerBlock); + unsigned int seed = time(NULL); + _bucket_qsgdTernarizeValue<<>>(dst, src, scaler, len, level, bucket_size, seed); +} + +void GPUDeternarizeMultiLevelValue(int len, float *dst, int8_t *src, double *scaler, int level, const int bucket_size) +{ + int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); + _bucket_qsgdDeternarizeAndAdd<<>>(len, dst, src, scaler, level, bucket_size); +} + + + + +std::vector qsgd_compress_cuda(torch::Tensor input, int level, int bucket_size) { + + int element_nums = input.numel(); + int num_buckets = ceil((float)element_nums / bucket_size); + auto d_l2norm_scaler = torch::zeros(num_buckets, torch::TensorOptions().dtype(torch::kFloat64).device(input.device())); + auto buffer_data = torch::empty(element_nums, torch::TensorOptions().dtype(torch::kInt8).device(input.device())); + +// curandState* cuda_states; +// cuda_states = (curandState*)torch::empty(element_nums, torch::TensorOptions().dtype(torch::kInt).device(input.device())).data_ptr(); +// qsgdGPUInit_curand(element_nums, time(NULL), cuda_states); + + GPUReduceL2Norm((float*)input.data_ptr(), element_nums, (double*)d_l2norm_scaler.data_ptr(), bucket_size); + GPUTernarizeMultiLevelValue((int8_t*)buffer_data.data_ptr(), (float*)input.data_ptr(), (double*)d_l2norm_scaler.data_ptr(), + element_nums, level, bucket_size); + + return {buffer_data, d_l2norm_scaler}; +} + + +torch::Tensor qsgd_decompress_cuda(torch::Tensor input, torch::Tensor d_l2norm_scaler, int level, int bucket_size) { + int element_nums = input.numel(); + int num_buckets = ceil((float)element_nums / bucket_size); + auto buffer_data = torch::empty(element_nums, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); + 
GPUDeternarizeMultiLevelValue(element_nums, (float*)buffer_data.data_ptr(), (int8_t*)input.data_ptr(), + (double*)d_l2norm_scaler.data_ptr(), level, bucket_size); + return buffer_data; +} \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py new file mode 100644 index 0000000..9542698 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py @@ -0,0 +1,14 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='qsgd_cuda', + ext_modules=[ + CUDAExtension('qsgd_cuda', [ + 'qsgd.cpp', + 'qsgd_cuda.cu', + ]), + ], + cmdclass={ + 'build_ext': BuildExtension + }) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh new file mode 100644 index 0000000..3b6cc4c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh @@ -0,0 +1,787 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
+ */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_load.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * + */ +enum BlockHistogramMemoryPreference +{ + GMEM, + SMEM, + BLEND +}; + + +/** + * Parameterizable tuning policy type for AgentHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue +struct AgentHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier 
for reading input elements +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. + int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading samples + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT, ///< Signed integer type for global offsets + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + /// The pixel type of SampleT + typedef typename CubVector::Type PixelT; + + /// The quad type of SampleT + typedef typename CubVector::Type QuadT; + + /// Constants + enum + { + BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + + PIXELS_PER_THREAD = 
AgentHistogramPolicyT::PIXELS_PER_THREAD, + SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, + QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + + TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, + TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + + IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + + MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? + AgentHistogramPolicyT::MEM_PREFERENCE : + GMEM, + + IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, + }; + + /// Cache load modifier for reading input elements + static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; + + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + SampleIteratorT>::Type // Directly use the supplied input iterator type + WrappedSampleIteratorT; + + /// Pixel input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedPixelIteratorT; + + /// Qaud input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedQuadIteratorT; + + /// Parameterized BlockLoad type for samples + typedef BlockLoad< + SampleT, + BLOCK_THREADS, + SAMPLES_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadSampleT; + + /// Parameterized BlockLoad type for pixels + typedef BlockLoad< + PixelT, + BLOCK_THREADS, + PIXELS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadPixelT; + + /// Parameterized BlockLoad type for quads + typedef BlockLoad< + QuadT, + BLOCK_THREADS, + QUADS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadQuadT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + + int tile_idx; + + // Aliasable storage layout + union Aliasable + { + 
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples + typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels + typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + + } aliasable; + }; + + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Sample input iterator (with cache modifier applied, if possible) + WrappedSampleIteratorT d_wrapped_samples; + + /// Native pointer for input samples (possibly NULL if unavailable) + SampleT* d_native_samples; + + /// The number of output bins for each channel + int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + + /// The number of privatized bins for each channel + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + + /// Reference to gmem privatized histograms for each channel + CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; + + /// Reference to final output histograms (gmem) + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining privatized counter indices from samples, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// Whether to prefer privatized smem counters vs privatized global counters + bool prefer_smem; + + + //--------------------------------------------------------------------- + // Initialize privatized bin counters + //--------------------------------------------------------------------- + + // Initialize privatized bin counters + 
__device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) + { + privatized_histograms[CHANNEL][privatized_bin] = 0; + } + } + + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + } + + + // Initialize privatized bin counters. Specialized for privatized shared-memory counters + __device__ __forceinline__ void InitSmemBinCounters() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + InitBinCounters(privatized_histograms); + } + + + // Initialize privatized bin counters. Specialized for privatized global-memory counters + __device__ __forceinline__ void InitGmemBinCounters() + { + InitBinCounters(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Update final output histograms + //--------------------------------------------------------------------- + + // Update final output histograms from privatized histograms + __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + + // Apply privatized bin counts to output bin counts + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_bins = num_privatized_bins[CHANNEL]; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; + privatized_bin += BLOCK_THREADS) + { + int output_bin = -1; + CounterT count = privatized_histograms[CHANNEL][privatized_bin]; + bool is_valid = count > 0; + + 
output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); + + if (output_bin >= 0) + { + atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); + } + + } + } + } + + + // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters + __device__ __forceinline__ void StoreSmemOutput() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + StoreOutput(privatized_histograms); + } + + + // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters + __device__ __forceinline__ void StoreGmemOutput() + { + StoreOutput(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Tile accumulation + //--------------------------------------------------------------------- + + // Accumulate pixels. Specialized for RLE compression. 
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + // Bin pixels + int bins[PIXELS_PER_THREAD]; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + bins[PIXEL] = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); + } + + CounterT accumulator = 1; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) + { + if (bins[PIXEL] != bins[PIXEL + 1]) + { + if (bins[PIXEL] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); + + accumulator = 0; + } + accumulator++; + } + + // Last pixel + if (bins[PIXELS_PER_THREAD - 1] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); + } + } + + + // Accumulate pixels. Specialized for individual accumulation of each pixel. 
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int bin = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); + if (bin >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bin, 1); + } + } + } + + + /** + * Accumulate pixel, specialized for smem privatized histogram + */ + __device__ __forceinline__ void AccumulateSmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); + } + + + /** + * Accumulate pixel, specialized for gmem privatized histogram + */ + __device__ __forceinline__ void AccumulateGmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); + } + + + + //--------------------------------------------------------------------- + // Tile loading + //--------------------------------------------------------------------- + + // Load full, aligned tile using pixel iterator (multi-channel) + template + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + 
block_offset)); + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples)); + } + + // Load full, aligned tile using quad iterator (single-channel) + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<1> num_active_channels) + { + typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + + WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + + // Load using a wrapped quad iterator + BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( + d_wrapped_quads, + reinterpret_cast(samples)); + } + + // Load full, aligned tile + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); + } + + // Load full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + // Load using sample iterator + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples)); + } + + // Load partially-full, aligned tile using the pixel iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + int valid_pixels = valid_samples / NUM_CHANNELS; + + // Load using a wrapped pixel iterator + 
BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples), + valid_pixels); + } + + // Load partially-full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples), + valid_samples); + } + + + //--------------------------------------------------------------------- + // Tile processing + //--------------------------------------------------------------------- + + // Consume a tile of data samples + template < + bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_FULL_TILE> // Whether the tile is full + __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) + { + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; + bool is_valid[PIXELS_PER_THREAD]; + + // Load tile + LoadTile( + block_offset, + valid_samples, + samples, + Int2Type(), + Int2Type()); + + // Set valid flags + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + + // Accumulate samples +#if CUB_PTX_ARCH >= 120 + if (prefer_smem) + AccumulateSmemPixels(samples, is_valid); + else + AccumulateGmemPixels(samples, is_valid); +#else + AccumulateGmemPixels(samples, is_valid); +#endif + + } + + + // Consume row tiles. 
Specialized for work-stealing from queue + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + + int num_tiles = num_rows * tiles_per_row; + int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; + OffsetT num_even_share_tiles = gridDim.x * gridDim.y; + + while (tile_idx < num_tiles) + { + int row = tile_idx / tiles_per_row; + int col = tile_idx - (row * tiles_per_row); + OffsetT row_offset = row * row_stride_samples; + OffsetT col_offset = (col * TILE_SAMPLES); + OffsetT tile_offset = row_offset + col_offset; + + if (col == tiles_per_row - 1) + { + // Consume a partially-full tile at the end of the row + OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; + ConsumeTile(tile_offset, num_remaining); + } + else + { + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + } + + CTA_SYNC(); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; + + CTA_SYNC(); + + tile_idx = temp_storage.tile_idx; + } + } + + + // Consume row tiles. 
Specialized for even-share (striped across thread blocks) + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + for (int row = blockIdx.y; row < num_rows; row += gridDim.y) + { + OffsetT row_begin = row * row_stride_samples; + OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); + OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); + + while (tile_offset < row_end) + { + OffsetT num_remaining = row_end - tile_offset; + + if (num_remaining < TILE_SAMPLES) + { + // Consume partial tile + ConsumeTile(tile_offset, num_remaining); + break; + } + + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + tile_offset += gridDim.x * TILE_SAMPLES; + } + } + } + + + //--------------------------------------------------------------------- + // Parameter extraction + //--------------------------------------------------------------------- + + // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) + template < + CacheLoadModifier _MODIFIER, + typename _ValueT, + typename _OffsetT> + __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) + { + return itr.ptr; + } + + // Return a native pixel pointer (specialized for other types) + template + __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) + { + return NULL; + } + + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + /** + * Constructor + */ + __device__ __forceinline__ AgentHistogram( 
+ TempStorage &temp_storage, ///< Reference to temp_storage + SampleIteratorT d_samples, ///< Input data to reduce + int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms + CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel + : + temp_storage(temp_storage.Alias()), + d_wrapped_samples(d_samples), + num_output_bins(num_output_bins), + num_privatized_bins(num_privatized_bins), + d_output_histograms(d_output_histograms), + privatized_decode_op(privatized_decode_op), + output_decode_op(output_decode_op), + d_native_samples(NativePointer(d_wrapped_samples)), + prefer_smem((MEM_PREFERENCE == SMEM) ? + true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? 
+ false : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms + { + int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; + + // Initialize the locations of this block's privatized histograms + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); + } + + + /** + * Consume image + */ + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) + int quad_mask = AlignBytes::ALIGN_BYTES - 1; + int pixel_mask = AlignBytes::ALIGN_BYTES - 1; + size_t row_bytes = sizeof(SampleT) * row_stride_samples; + + bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel + ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + + bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel + ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned + ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel + + // Whether rows are aligned and can be vectorized + if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + 
else + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + } + + + /** + * Initialize privatized bin counters. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void InitBinCounters() + { + if (prefer_smem) + InitSmemBinCounters(); + else + InitGmemBinCounters(); + } + + + /** + * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void StoreOutput() + { + if (prefer_smem) + StoreSmemOutput(); + else + StoreGmemOutput(); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh new file mode 100644 index 0000000..0eee5f4 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh @@ -0,0 +1,772 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ + + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Radix ranking algorithm + */ +enum RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH +}; + +/** + * Parameterizable tuning policy type for AgentRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use + static const 
BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + + + + +/** + * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . + */ +template < + typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< KeyT type + typename ValueT, ///< ValueT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of KeyT + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; + + enum + { + BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Input iterator wrapper type (for applying 
cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // Radix ranking type to use + typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), + BlockRadixRank, + typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + BlockRadixRank, + BlockRadixRankMatch + >::Type + >::Type BlockRadixRankT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD + }; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadKeysT; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadValuesT; + + // Value exchange array type + typedef ValueT ValueExchangeT[TILE_ITEMS]; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + typename BlockLoadKeysT::TempStorage load_keys; + typename BlockLoadValuesT::TempStorage load_values; + typename BlockRadixRankT::TempStorage radix_rank; + + struct + { + UnsignedBits exchange_keys[TILE_ITEMS]; + OffsetT relative_bin_offsets[RADIX_DIGITS]; + }; + + Uninitialized exchange_values; + + OffsetT exclusive_digit_prefix[RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + ValueT *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; + + // The least-significant bit position of the current digit to extract + int 
current_bit; + + // Number of bits in current digit + int num_bits; + + // Whether to short-cirucit + int short_circuit; + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + + /** + * Scatter ranked keys through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = BFE(key, current_bit, num_bits); + relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + + // Un-twiddle + key = Traits::TwiddleOut(key); + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; + } + } + } + + + /** + * Scatter ranked values through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + CTA_SYNC(); + + ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + exchange_values[ranks[ITEM]] = values[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; 
+ + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; + } + } + } + + /** + * Load a tile of keys (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys, valid_items, oob_item); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); + } + + + /** + * Load a tile of keys (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); + } + + + /** + * Load a tile of values (specialized for full tile, any ranking algorithm) + */ + template + __device__ 
__forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values); + + CTA_SYNC(); + } + + + /** + * Load a tile of values (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values, valid_items); + + CTA_SYNC(); + } + + + /** + * Load a tile of items (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + volatile OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); + } + + + /** + * Load a tile of items (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + volatile OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type /*is_keys_only*/) + { + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + + LoadValues( + values, + block_offset, + valid_items, + Int2Type(), + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items); + } + + + /** + * 
Truck along associated values (specialized for key-only sorting) + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + OffsetT /*block_offset*/, + OffsetT /*valid_items*/, + Int2Type /*is_keys_only*/) + {} + + + /** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + OffsetT block_offset, + const OffsetT &valid_items = TILE_ITEMS) + { + UnsignedBits keys[ITEMS_PER_THREAD]; + int ranks[ITEMS_PER_THREAD]; + OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; + + // Assign default (min/max) value to all keys + UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; + + // Load tile of keys + LoadKeys( + keys, + block_offset, + valid_items, + default_key, + Int2Type(), + Int2Type()); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + BlockRadixRankT(temp_storage.radix_rank).RankKeys( + keys, + ranks, + current_bit, + num_bits, + exclusive_digit_prefix); + + CTA_SYNC(); + + // Share exclusive digit prefix + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Store exclusive prefix + temp_storage.exclusive_digit_prefix[bin_idx] = + exclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Get inclusive digit prefix + int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + // Get inclusive digit prefix from exclusive 
prefix (higher bins come first) + inclusive_digit_prefix[track] = (bin_idx == 0) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx - 1]; + } + else + { + // Get inclusive digit prefix from exclusive prefix (lower bins come first) + inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx + 1]; + } + } + } + + CTA_SYNC(); + + // Update global scatter base offsets for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_offset[track] -= exclusive_digit_prefix[track]; + temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + bin_offset[track] += inclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Scatter keys + ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); + + // Gather/scatter values + GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); + } + + //--------------------------------------------------------------------- + // Copy shortcut + //--------------------------------------------------------------------- + + /** + * Copy tiles within the range of input + */ + template < + typename InputIteratorT, + typename T> + __device__ __forceinline__ void Copy( + InputIteratorT d_in, + T *d_out, + OffsetT block_offset, + OffsetT block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + OffsetT valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + 
LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template + __device__ __forceinline__ void Copy( + InputIteratorT /*d_in*/, + NullType * /*d_out*/, + OffsetT /*block_offset*/, + OffsetT /*block_end*/) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], + OffsetT num_items, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + this->bin_offset[track] = bin_offset[track]; + + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Short circuit if the histogram has only bin counts of only zeros or problem-size + short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT num_items, + OffsetT *d_spine, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + 
temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + + // Load my block's bin offset for my bin + bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Distribute keys from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + OffsetT block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + CTA_SYNC(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh new file mode 100644 index 0000000..803fadf --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh @@ -0,0 +1,526 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
+ */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../warp/warp_reduce.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
+ */ +template < + typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type + typename KeyT, ///< KeyT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. 
+ + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; + OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + AgentRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits 
/*keys*/[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; 
UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting + CTA_SYNC(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + OffsetT block_offset, + const OffsetT &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortUpsweep( + TempStorage &temp_storage, + const KeyT *d_keys_in, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. 
+ */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + const OffsetT &block_end) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + CTA_SYNC(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + CTA_SYNC(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + CTA_SYNC(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + } + + + /** + * Extract counts (saving them to the external array) + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT *counters, + int bin_stride = 1, + int bin_offset = 0) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + + // Whole blocks + #pragma unroll + for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; + (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; + BIN_BASE += 
BLOCK_THREADS) + { + int bin_idx = BIN_BASE + threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + + // Remainder + if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) + { + int bin_idx = threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + } + + + /** + * Extract counts + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_count[track] = 0; + + #pragma unroll + for (int 
i = 0; i < WARP_THREADS; ++i) + bin_count[track] += temp_storage.block_counters[i][bin_idx]; + } + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh new file mode 100644 index 0000000..5528d8b --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements +struct AgentReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = 
_VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ +template < + typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type + typename InputIteratorT, ///< Random-access iterator type for input + typename OutputIteratorT, ///< Random-access iterator type for output + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct AgentReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type InputT; + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + /// Vector type of InputT for data movement + typedef typename CubVector::Type VectorT; + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + /// Constants + enum + { + BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (IsPointer::VALUE) && Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + + //--------------------------------------------------------------------- + // Utility + 
//--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator /*d_in*/, + Int2Type /*can_vectorize*/) + { + return false; + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_wrapped_in(d_in), + reduction_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * Consume a full tile of input (non-vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + OutputT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? 
+ internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a full tile of input (vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; + + // Convert from input type to output type + OutputT items[ITEMS_PER_THREAD]; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + items[i] = input_items[i]; + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? 
+ internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a partial tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int valid_items, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) + { + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + // Continue reading items (block-striped) + while (thread_offset < valid_items) + { + OutputT item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + template + __device__ __forceinline__ OutputT ConsumeRange( + GridEvenShare &even_share, ///< GridEvenShare descriptor + Int2Type can_vectorize) ///< Whether or not we can vectorize loads + { + OutputT thread_aggregate; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) + { + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); + } + + // At least one full block + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, 
Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) + { + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + } + + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) + { + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + } + + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + } + + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeRange( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeTiles( + GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) ? 
+ ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh new file mode 100644 index 0000000..a57d60e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh @@ -0,0 +1,549 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = 
_ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentReduceByKey +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair OffsetValuePairT; + + // Tuple type for pairing keys and values + typedef KeyValuePair KeyValuePairT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Guarded inequality functor + template + struct GuardedInequalityWrapper + { + _EqualityOpT op; ///< Wrapped equality operator + int num_remaining; ///< Items remaining + + /// Constructor + __host__ __device__ __forceinline__ + GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const + { + if (idx < num_remaining) + return !op(a, b); // In bounds + + // Return true if first out-of-bounds item, false otherwise + return (idx == num_remaining); + } + }; + + + // Constants + enum + { + BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), 
+ }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeysInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedKeysInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValuesInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedValuesInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for keys + typedef BlockLoad< + KeyOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadKeysT; + + // Parameterized BlockLoad type for values + typedef BlockLoad< + ValueOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadValuesT; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity< + KeyOutputT, + BLOCK_THREADS> + BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetValuePairT, + BLOCK_THREADS, + AgentReduceByKeyPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Key and value exchange types + typedef KeyOutputT 
KeyExchangeT[TILE_ITEMS + 1]; + typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading keys + typename BlockLoadKeysT::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValuesT::TempStorage load_values; + + // Smem needed for compacting key value pairs(allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedKeysInputIteratorT d_keys_in; ///< Input keys + UniqueOutputIteratorT d_unique_out; ///< Unique output keys + WrappedValuesInputIteratorT d_values_in; ///< Input values + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified + EqualityOpT equality_op; ///< KeyT equality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentReduceByKey( + TempStorage& temp_storage, ///< Reference to temp_storage + KeysInputIteratorT d_keys_in, ///< Input keys + UniqueOutputIteratorT 
d_unique_out, ///< Unique output keys + ValuesInputIteratorT d_values_in, ///< Input values + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_unique_out(d_unique_out), + d_values_in(d_values_in), + d_aggregates_out(d_aggregates_out), + d_num_runs_out(d_num_runs_out), + equality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Directly scatter flagged items to output offsets + */ + __device__ __forceinline__ void ScatterDirect( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD]) + { + // Scatter flagged keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; + d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; + } + } + } + + + /** + * 2-phase scatter flagged items to output offsets + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate: the scatter offsets must be decremented for value aggregates + */ + __device__ __forceinline__ void ScatterTwoPhase( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + CTA_SYNC(); + + // Compact and scatter pairs + #pragma unroll + for (int ITEM = 0; ITEM < 
ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) + { + KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; + d_unique_out[num_tile_segments_prefix + item] = pair.key; + d_aggregates_out[num_tile_segments_prefix + item] = pair.value; + } + } + + + /** + * Scatter flagged items + */ + __device__ __forceinline__ void Scatter( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) + { + ScatterTwoPhase( + scatter_items, + segment_flags, + segment_indices, + num_tile_segments, + num_tile_segments_prefix); + } + else + { + ScatterDirect( + scatter_items, + segment_flags, + segment_indices); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys + KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up + ValueOutputT values[ITEMS_PER_THREAD]; // Tile values + OffsetT 
head_flags[ITEMS_PER_THREAD]; // Segment head flags + OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices + OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering + + // Load keys + if (IS_LAST_TILE) + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); + else + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + + // Load tile predecessor key in first thread + KeyOutputT tile_predecessor; + if (threadIdx.x == 0) + { + tile_predecessor = (tile_idx == 0) ? + keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) + d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile + } + + CTA_SYNC(); + + // Load values + if (IS_LAST_TILE) + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); + else + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); + + CTA_SYNC(); + + // Initialize head-flags and shuffle up the previous keys + if (IS_LAST_TILE) + { + // Use custom flag operator to additionally flag the first out-of-bounds item + GuardedInequalityWrapper flag_op(equality_op, num_remaining); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + else + { + InequalityWrapper flag_op(equality_op); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + + // Zip values and head flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_items[ITEM].value = values[ITEM]; + scan_items[ITEM].key = head_flags[ITEM]; + } + + // Perform exclusive tile scan + OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate + OffsetT num_segments_prefix; // Number of 
segments prior to this tile + ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate + if (tile_idx == 0) + { + // Scan first tile + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); + num_segments_prefix = 0; + total_aggregate = block_aggregate.value; + + // Update tile status if there are successor tiles + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); + + block_aggregate = prefix_op.GetBlockAggregate(); + num_segments_prefix = prefix_op.GetExclusivePrefix().key; + total_aggregate = reduction_op( + prefix_op.GetExclusivePrefix().value, + block_aggregate.value); + } + + // Rezip scatter items and segment indices + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scatter_items[ITEM].key = prev_keys[ITEM]; + scatter_items[ITEM].value = scan_items[ITEM].value; + segment_indices[ITEM] = scan_items[ITEM].key; + } + + // At this point, each flagged segment head has: + // - The key for the previous segment + // - The reduced value from the previous segment + // - The segment index for the reduced value + + // Scatter flagged keys and values + OffsetT num_tile_segments = block_aggregate.key; + Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); + + // Last thread in last tile will output final count (and last pair, if necessary) + if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) + { + OffsetT num_segments = num_segments_prefix + num_tile_segments; + + // If the last tile is a whole tile, output the final_value + if (num_remaining == TILE_ITEMS) + { + d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; + d_aggregates_out[num_segments] = total_aggregate; + num_segments++; 
+ } + + // Output the total number of items selected + *d_num_runs_out = num_segments; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh new file mode 100644 index 0000000..0ba9216 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh @@ -0,0 +1,837 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRle + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentRlePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = 
_LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for data + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values + typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRle +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type T; + + /// The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + /// Tuple type for scanning (pairs run-length and run-index) + typedef KeyValuePair LengthOffsetPair; + + /// Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + }; + + + /** + * Special operator that signals all out-of-bounds items are not equal to everything else, + * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked + * trivial. 
+ */ + template + struct OobInequalityOp + { + OffsetT num_remaining; + EqualityOpT equality_op; + + __device__ __forceinline__ OobInequalityOp( + OffsetT num_remaining, + EqualityOpT equality_op) + : + num_remaining(num_remaining), + equality_op(equality_op) + {} + + template + __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) + { + if (!LAST_TILE || (idx < num_remaining)) + return !equality_op(first, second); + else + return true; + } + }; + + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for data + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Parameterized BlockLoad type for data + typedef BlockLoad< + T, + AgentRlePolicyT::BLOCK_THREADS, + AgentRlePolicyT::ITEMS_PER_THREAD, + AgentRlePolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockDiscontinuity type for data + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized WarpScan type + typedef WarpScan WarpScanPairs; + + // Reduce-length-by-run scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + LengthOffsetPair, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Warp exchange types + typedef WarpExchange WarpExchangePairs; + + typedef typename If::Type WarpExchangePairsStorage; + + typedef WarpExchange WarpExchangeOffsets; + typedef WarpExchange WarpExchangeLengths; + + typedef LengthOffsetPair WarpAggregates[WARPS]; + + // Shared memory type for this thread block + struct _TempStorage + { + // Aliasable storage layout + union Aliasable + { + struct + { + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + typename WarpScanPairs::TempStorage warp_scan[WARPS]; 
// Smem needed for warp-synchronous scans + Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage load; + + // Aliasable layout needed for two-phase scatter + union ScatterAliasable + { + unsigned long long align; + WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; + + } scatter_aliasable; + + } aliasable; + + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentRle( + TempStorage &temp_storage, ///< [in] Reference to temp_storage + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, 
///< [out] Pointer to output sequence of run offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths + EqualityOpT equality_op, ///< [in] T equality operator + OffsetT num_items) ///< [in] Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_offsets_out(d_offsets_out), + d_lengths_out(d_lengths_out), + equality_op(equality_op), + scan_op(cub::Sum()), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_remaining, + T (&items)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + bool head_flags[ITEMS_PER_THREAD]; + bool tail_flags[ITEMS_PER_THREAD]; + + OobInequalityOp inequality_op(num_remaining, equality_op); + + if (FIRST_TILE && LAST_TILE) + { + // First-and-last-tile always head-flags the first item and tail-flags the last item + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, items, inequality_op); + } + else if (FIRST_TILE) + { + // First-tile always head-flags the first item + + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, tile_successor_item, items, inequality_op); + } + else if (LAST_TILE) + { + // Last-tile always flags the last item + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, 
tile_predecessor_item, tail_flags, items, inequality_op); + } + else + { + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); + } + + // Zip counts and runs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); + lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); + } + } + + //--------------------------------------------------------------------- + // Scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan of allocations + */ + __device__ __forceinline__ void WarpScanAllocations( + LengthOffsetPair &tile_aggregate, + LengthOffsetPair &warp_aggregate, + LengthOffsetPair &warp_exclusive_in_tile, + LengthOffsetPair &thread_exclusive_in_warp, + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + // Perform warpscans + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + LengthOffsetPair identity; + identity.key = 0; + identity.value = 0; + + LengthOffsetPair thread_inclusive; + LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); + WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( + thread_aggregate, + thread_inclusive, + thread_exclusive_in_warp, + identity, + scan_op); + + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; + + CTA_SYNC(); + + // Accumulate total selected and the warp-wide prefix + warp_exclusive_in_tile = identity; + warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; + tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_exclusive_in_tile = tile_aggregate; + + tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Two-phase scatter, specialized for warp time-slicing + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Locally compact items within the warp (first warp) + if (warp_id == 0) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + + // Locally compact items within the warp (remaining warps) + #pragma unroll + for (int SLICE = 1; SLICE < WARPS; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + } + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Two-phase scatter + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Unzip + OffsetT run_offsets[ITEMS_PER_THREAD]; + LengthT run_lengths[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + run_offsets[ITEM] = lengths_and_offsets[ITEM].key; + run_lengths[ITEM] = lengths_and_offsets[ITEM].value; + } + + WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( + run_offsets, thread_num_runs_exclusive_in_warp); + + WARP_SYNC(0xffffffff); + + WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( + run_lengths, thread_num_runs_exclusive_in_warp); + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = run_offsets[ITEM]; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = run_lengths[ITEM]; + } + } + } + } + + + /** + * Direct scatter + */ + template + __device__ __forceinline__ void ScatterDirect( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + thread_num_runs_exclusive_in_warp[ITEM]; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if 
not the first (global) length + if (item_offset >= 1) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Scatter + */ + template + __device__ __forceinline__ void Scatter( + OffsetT tile_num_runs_aggregate, + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) + { + // Direct scatter if the warp has any items + if (warp_num_runs_aggregate) + { + ScatterDirect( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + } + } + else + { + // Scatter two phase + ScatterTwoPhase( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets, + Int2Type()); + } + } + + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ LengthOffsetPair ConsumeTile( + OffsetT num_items, ///< Total number of global input items + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status + { + if (tile_idx == 0) + { + // First tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + 
BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_aggregate); + + // Update thread_exclusive_in_warp to fold in warp run-length + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; + + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + + // Downsweep scan through lengths_and_num_runs + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = 0; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return tile_aggregate; + } + else + { + // Not first tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // First warp computes tile prefix in lane 0 + TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + if (warp_id == 0) + { + prefix_op(tile_aggregate); + if (threadIdx.x == 0) + temp_storage.tile_exclusive = prefix_op.exclusive_prefix; + } + + CTA_SYNC(); + + LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; + + // Update thread_exclusive_in_warp to fold in warp and tile run-lengths + LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += thread_exclusive.value; + + // Downsweep scan through lengths_and_num_runs + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return prefix_op.inclusive_prefix; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_status, ///< Global list of tile status + NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + + if (threadIdx.x == 0) + { + // Output the total number of items selected + *d_num_runs_out = running_total.key; + + // The inclusive prefix contains accumulated length reduction for the last run + if (running_total.key > 0) + 
d_lengths_out[running_total.key - 1] = running_total.value; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh new file mode 100644 index 0000000..567df80 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh @@ -0,0 +1,471 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< 
Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type + typename OutputIteratorT, ///< Random-access output iterator type + typename ScanOpT, ///< Scan functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OutputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + OutputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + 
}; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input data + OutputIteratorT d_out; ///< Output data + ScanOpT scan_op; ///< Binary scan operator + InitValueT init_value; ///< The init_value element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + 
PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScan( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanOpT scan_op, ///< Binary scan operator + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + init_value(init_value) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, 
tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + + + //--------------------------------------------------------------------- + // Scan an sequence of consecutive tiles (independent of other thread blocks) + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool IS_FIRST_TILE, + bool IS_LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + OffsetT tile_offset, ///< Tile offset + RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Block scan + if 
(IS_FIRST_TILE) + { + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + prefix_op.running_total = block_aggregate; + } + else + { + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end) ///< [in] Threadblock end offset (exclusive) + { + BlockScanRunningPrefixOp prefix_op(scan_op); + + if (range_offset + TILE_ITEMS <= range_end) + { + // Consume first tile of input (full) + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end, ///< [in] Threadblock end offset (exclusive) + OutputT prefix) ///< [in] The prefix to apply to the scan segment + { + BlockScanRunningPrefixOp prefix_op(prefix, scan_op); + + // Consume full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; 
+ } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh new file mode 100644 index 0000000..efa6d86 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSegmentFixup + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSegmentFixupPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = 
_ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentSegmentFixup +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key-value input iterator + typedef typename std::iterator_traits::value_type KeyValuePairT; + + // Value type + typedef typename KeyValuePairT::Value ValueT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not do fixup using RLE + global atomics + USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && + 
(Equals::VALUE || + Equals::VALUE || + Equals::VALUE || + Equals::VALUE), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + PairsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedPairsInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for pairs + typedef BlockLoad< + KeyValuePairT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSegmentFixupPolicyT::LOAD_ALGORITHM> + BlockLoadPairs; + + // Parameterized BlockScan type + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSegmentFixupPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + KeyValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for loading keys + typename BlockLoadPairs::TempStorage load_pairs; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : 
Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSegmentFixup( + TempStorage& temp_storage, ///< Reference to temp_storage + PairsInputIteratorT d_pairs_in, ///< Input keys + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_pairs_in(d_pairs_in), + d_aggregates_out(d_aggregates_out), + d_fixup_in(d_aggregates_out), + inequality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process input tile. 
Specialized for atomic-fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + // RLE + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; + if (pairs[ITEM].key != pairs[ITEM - 1].key) + atomicAdd(d_scatter, pairs[ITEM - 1].value); + else + pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); + } + + // Flush last item if valid + ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; + if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) + atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); + } + + + /** + * Process input tile. 
Specialized for reduce-by-key fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + CTA_SYNC(); + + KeyValuePairT tile_aggregate; + if (tile_idx == 0) + { + // Exclusive scan of values and segment_flags + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); + + // Update tile status if this is not the last tile + if (threadIdx.x == 0) + { + // Set first segment id to not trigger a flush (invalid from exclusive scan) + scatter_pairs[0].key = pairs[0].key; + + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, tile_aggregate); + + } + } + else + { + // Exclusive scan of values and segment_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); + tile_aggregate = prefix_op.GetBlockAggregate(); + } + + // Scatter updated values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scatter_pairs[ITEM].key != pairs[ITEM].key) + { + // Update the value at the key location + ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; + value = reduction_op(value, scatter_pairs[ITEM].value); + + d_aggregates_out[scatter_pairs[ITEM].key] = value; + } + } + + // Finalize the last item + if 
(IS_LAST_TILE) + { + // Last thread will output final count and last item, if necessary + if (threadIdx.x == BLOCK_THREADS - 1) + { + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + // Update the value at the key location + OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; + d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); + } + } + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not the last tile (full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh new file mode 100644 index 0000000..f365481 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh @@ -0,0 +1,703 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. 
All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSelectIf + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSelectIfPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * + * Performs 
functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for selection items + typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct AgentSelectIf +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for items + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + FlagsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFlagsInputIteratorT; + + // Parameterized BlockLoad type for input data + typedef BlockLoad< + OutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + FlagT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockDiscontinuity type for items + typedef BlockDiscontinuity< + OutputT, + BLOCK_THREADS> + BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetT, + BLOCK_THREADS, + AgentSelectIfPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + 
OffsetT, + cub::Sum, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Item exchange type + typedef OutputT ItemExchangeT[TILE_ITEMS]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading items + typename BlockLoadT::TempStorage load_items; + + // Smem needed for loading values + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSelectIf( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< Output data + SelectOpT select_op, ///< 
Selection operator + EqualityOpT equality_op, ///< Equality operator + OffsetT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags_in(d_flags_in), + d_selected_out(d_selected_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT /*tile_offset*/, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 1; + + if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) + selection_flags[ITEM] = select_op(items[ITEM]); + } + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&/*items*/)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + CTA_SYNC(); + + FlagT flags[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Out-of-bounds items are selection_flags + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + } + else + { + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + } + + // Convert flag type to selection_flags type + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selection_flags[ITEM] = flags[ITEM]; + } + } + + + /** + * Initialize selections (specialized for 
discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + if (IS_FIRST_TILE) + { + CTA_SYNC(); + + // Set head selection_flags. First tile sets the first flag for the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); + } + else + { + OutputT tile_predecessor; + if (threadIdx.x == 0) + tile_predecessor = d_in[tile_offset - 1]; + + CTA_SYNC(); + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); + } + + // Set selection flags for out-of-bounds items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + */ + template + __device__ __forceinline__ void ScatterDirect( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + OffsetT num_selections) + { + // Scatter flagged items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selection_flags[ITEM]) + { + if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) + { + d_selected_out[selection_indices[ITEM]] = items[ITEM]; + } + } + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT 
(&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) + { + d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + int tile_num_rejections = num_tile_items - num_tile_selections; + + // Scatter items to shared memory (rejections first) + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int 
item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; + int local_rejection_idx = item_idx - local_selection_idx; + int local_scatter_offset = (selection_flags[ITEM]) ? + tile_num_rejections + local_selection_idx : + local_rejection_idx; + + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + + CTA_SYNC(); + + // Gather items from shared memory and scatter to global + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + OffsetT scatter_offset = (item_idx < tile_num_rejections) ? + num_items - num_rejected_prefix - rejection_idx - 1 : + num_selections_prefix + selection_idx; + + OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + d_selected_out[scatter_offset] = item; + } + } + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + OffsetT num_selections) ///< Total number of selections including this tile + { + // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one + if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) + { + ScatterTwoPhase( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 
num_selections_prefix, + num_rejected_prefix, + Int2Type()); + } + else + { + ScatterDirect( + items, + selection_flags, + selection_indices, + num_selections); + } + } + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of selection_flags + OffsetT num_tile_selections; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + num_tile_selections -= (TILE_ITEMS - num_tile_items); + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 0, + 0, + num_tile_selections); + + return num_tile_selections; + } + + + /** + * Process subsequent tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of values and selection_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); + + OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); + OffsetT num_selections = prefix_op.GetInclusivePrefix(); + OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); + OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = TILE_ITEMS - num_tile_items; + num_selections -= num_discount; + num_tile_selections -= num_discount; + } + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile 
offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); + } + else + { + // The last tile (possibly partially-full) + OffsetT num_remaining = num_items - tile_offset; + OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + + if (threadIdx.x == 0) + { + // Output the total number of items selection_flags + *d_num_selected_out = num_selections; + } + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh new file mode 100644 index 0000000..3168554 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh @@ -0,0 +1,925 @@ +/****************************************************************************** + * Copyright (c) 
2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. 
+ */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_reduce.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../thread/thread_search.cuh" +#include "../thread/thread_operators.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/counting_input_iterator.cuh" +#include "../iterator/tex_ref_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSpmv + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search + CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets + CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices + CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values + CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values + bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. 
pre-staged through shared memory) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSpmvPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) + }; + + static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices + static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values + static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +struct SpmvParams +{ + ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
+ OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows; ///< Number of rows of matrix A. + int num_cols; ///< Number of columns of matrix A. + int num_nonzeros; ///< Number of nonzero elements of matrix A. + ValueT alpha; ///< Alpha multiplicand + ValueT beta; ///< Beta addend-multiplicand + + TexRefInputIterator t_vector_x; +}; + + +/** + * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 + bool HAS_BETA, ///< Whether the input parameter \p beta is 0 + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentSpmv +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + /// 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + /// Input iterator wrapper types (for applying cache modifiers) + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + 
OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, + OffsetT, + OffsetT> + ColumnIndicesIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + ValueIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // BlockReduce specialization + typedef BlockReduce< + ValueT, + BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // BlockScan specialization + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // BlockScan specialization + typedef BlockScan< + ValueT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockPrefixSumT; + + // BlockExchange specialization + typedef BlockExchange< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + /// Merge item type (either a non-zero value or a row-end offset) + union MergeItem + { + // Value type to pair with index type OffsetT (NullType if loading values directly during merge) + typedef typename If::Type MergeValueT; + + OffsetT row_end_offset; + MergeValueT nonzero; + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CoordinateT tile_coords[2]; + + union Aliasable + { + // Smem needed for tile of merge items + MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + + // Smem needed for block exchange + typename BlockExchangeT::TempStorage exchange; + + // Smem needed for block-wide reduction + typename 
BlockReduceT::TempStorage reduce; + + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for tile prefix sum + typename BlockPrefixSumT::TempStorage prefix_sum; + + } aliasable; + }; + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + + _TempStorage& temp_storage; /// Reference to temp_storage + + SpmvParams& spmv_params; + + ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
+ VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentSpmv( + TempStorage& temp_storage, ///< Reference to temp_storage + SpmvParams& spmv_params) ///< SpMV input parameter bundle + : + temp_storage(temp_storage.Alias()), + spmv_params(spmv_params), + wd_values(spmv_params.d_values), + wd_row_end_offsets(spmv_params.d_row_end_offsets), + wd_column_indices(spmv_params.d_column_indices), + wd_vector_x(spmv_params.d_vector_x), + wd_vector_y(spmv_params.d_vector_y) + {} + + + + + /** + * Consume a merge tile, specialized for direct-load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + + // Gather the row end-offsets for the merge tile into shared memory + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + 
s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + + ValueT running_total = 0.0; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + running_total += nonzero; + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = tile_num_rows; + ++thread_current_coord.y; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = thread_current_coord.x; + running_total = 0.0; + ++thread_current_coord.x; + } + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (tile_num_rows > 0) + { + if (threadIdx.x == 0) + scan_item.key = -1; + + // Direct scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM].key < tile_num_rows) + { + if (scan_item.key == scan_segment[ITEM].key) + scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; + + if 
(HAS_ALPHA) + { + scan_segment[ITEM].value *= spmv_params.alpha; + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; + scan_segment[ITEM].value += addend; + } + + // Set the output vector element + spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; + } + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + +#if (CUB_PTX_ARCH >= 520) + +/* + OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[tile_num_nonzeros].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; + + OffsetT col_indices[ITEMS_PER_THREAD]; + ValueT mat_values[ITEMS_PER_THREAD]; + int nonzero_indices[ITEMS_PER_THREAD]; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + nonzero_indices[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_indices[ITEM]; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_indices[ITEM]; + + col_indices[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *ci : 0; + mat_values[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? 
*a : 0.0; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + VectorValueIteratorT x = wd_vector_x + col_indices[ITEM]; + mat_values[ITEM] *= *x; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT *s = s_tile_nonzeros + nonzero_indices[ITEM]; + + *s = mat_values[ITEM]; + } + + CTA_SYNC(); + +*/ + + OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; + ValueT* s = s_tile_nonzeros + nonzero_idx; + + if (nonzero_idx < tile_num_nonzeros) + { + + OffsetT column_idx = *ci; + ValueT value = *a; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; + vector_value = wd_vector_x[column_idx]; + + ValueT nonzero = value * vector_value; + + *s = nonzero; + } + } + + +#else + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + if (tile_num_nonzeros > 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = 
wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + s_tile_nonzeros[nonzero_idx] = nonzero; + } + } + +#endif + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + ValueT running_total = 0.0; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + scan_segment[ITEM].value = nonzero; + running_total += nonzero; + ++thread_current_coord.y; + nonzero = s_tile_nonzeros[thread_current_coord.y]; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = 0.0; + running_total = 0.0; + ++thread_current_coord.x; + row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + } + + scan_segment[ITEM].key = thread_current_coord.x; + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + 
BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (threadIdx.x == 0) + { + scan_item.key = thread_start_coord.x; + scan_item.value = 0.0; + } + + if (tile_num_rows > 0) + { + + CTA_SYNC(); + + // Scan downsweep and scatter + ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; + + if (scan_item.key != scan_segment[0].key) + { + s_partials[scan_item.key] = scan_item.value; + } + else + { + scan_segment[0].value += scan_item.value; + } + + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) + { + s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; + } + else + { + scan_segment[ITEM].value += scan_segment[ITEM - 1].value; + } + } + + CTA_SYNC(); + + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + * / + template + __device__ __forceinline__ KeyValuePairT ConsumeTile1( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + IsDirectLoadT is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + + OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; + + int warp_idx = threadIdx.x / WARP_THREADS; + int lane_idx = LaneId(); + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + 
CTA_SYNC(); + + // Search for warp start/end coords + if (lane_idx == 0) + { + MergePathSearch( + OffsetT(warp_idx * ITEMS_PER_WARP), // Diagonal + s_tile_row_end_offsets, // List A + CountingInputIterator(tile_start_coord.y), // List B + tile_num_rows, + tile_num_nonzeros, + temp_storage.warp_coords[warp_idx]); + + CoordinateT last = {tile_num_rows, tile_num_nonzeros}; + temp_storage.warp_coords[WARPS] = last; + } + + CTA_SYNC(); + + CoordinateT warp_coord = temp_storage.warp_coords[warp_idx]; + CoordinateT warp_end_coord = temp_storage.warp_coords[warp_idx + 1]; + OffsetT warp_nonzero_idx = tile_start_coord.y + warp_coord.y; + + // Consume whole rows + #pragma unroll 1 + for (; warp_coord.x < warp_end_coord.x; ++warp_coord.x) + { + ValueT row_total = 0.0; + OffsetT row_end_offset = s_tile_row_end_offsets[warp_coord.x]; + + #pragma unroll 1 + for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; + nonzero_idx < row_end_offset; + nonzero_idx += WARP_THREADS) + { + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + ValueT vector_value = wd_vector_x[column_idx]; + row_total += value * vector_value; + } + + // Warp reduce + row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); + + // Output + if (lane_idx == 0) + { + spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; + } + + warp_nonzero_idx = row_end_offset; + } + + // Consume partial portion of thread's last row + if (warp_nonzero_idx < tile_start_coord.y + warp_end_coord.y) + { + ValueT row_total = 0.0; + for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; + nonzero_idx < tile_start_coord.y + warp_end_coord.y; + nonzero_idx += WARP_THREADS) + { + + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + ValueT vector_value = wd_vector_x[column_idx]; + row_total += value * vector_value; + } + + // Warp reduce + row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); 
+ + // Output + if (lane_idx == 0) + { + spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; + } + } + + // Return the tile's running carry-out + KeyValuePairT tile_carry(tile_num_rows, 0.0); + return tile_carry; + } +*/ + + + + + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + * / + __device__ __forceinline__ KeyValuePairT ConsumeTile2( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + + ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; + + ValueT nonzeros[ITEMS_PER_THREAD]; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + + nonzeros[ITEM] = value * vector_value; + } + + // Exchange striped->blocked + BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros); + + CTA_SYNC(); + + // Compute an inclusive prefix sum + BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros); + + CTA_SYNC(); + + if (threadIdx.x == 0) + s_tile_nonzeros[0] = 0.0; + + // Scatter back to smem + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1; + s_tile_nonzeros[item_idx] = nonzeros[ITEM]; + } + + CTA_SYNC(); + + // Gather the row end-offsets for 
the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y); + OffsetT end = wd_row_end_offsets[tile_start_coord.x + item]; + + start -= tile_start_coord.y; + end -= tile_start_coord.y; + + ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start]; + + spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial; + } + + // Get the tile's carry-out + KeyValuePairT tile_carry; + if (threadIdx.x == 0) + { + tile_carry.key = tile_num_rows; + + OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y); + start -= tile_start_coord.y; + OffsetT end = tile_num_nonzeros; + + tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start]; + } + + // Return the tile's running carry-out + return tile_carry; + } +*/ + + + /** + * Consume input tile + */ + __device__ __forceinline__ void ConsumeTile( + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_merge_tiles) ///< [in] Number of merge tiles + { + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + + if (tile_idx >= num_merge_tiles) + return; + + // Read our starting coordinates + if (threadIdx.x < 2) + { + if (d_tile_coordinates == NULL) + { + // Search our starting coordinates + OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; + CoordinateT tile_coord; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coord); + + temp_storage.tile_coords[threadIdx.x] = tile_coord; + } + else + { + 
temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; + } + } + + CTA_SYNC(); + + CoordinateT tile_start_coord = temp_storage.tile_coords[0]; + CoordinateT tile_end_coord = temp_storage.tile_coords[1]; + + // Consume multi-segment tile + KeyValuePairT tile_carry = ConsumeTile( + tile_idx, + tile_start_coord, + tile_end_coord, + Int2Type()); + + // Output the tile's carry-out + if (threadIdx.x == 0) + { + if (HAS_ALPHA) + tile_carry.value *= spmv_params.alpha; + + tile_carry.key += tile_start_coord.x; + d_tile_carry_pairs[tile_idx] = tile_carry; + } + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh new file mode 100644 index 0000000..2f67137 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh @@ -0,0 +1,815 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a + * region independent of other thread blocks + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. 
+ */ +template < + typename T, ///< BlockScan value type + typename ScanOpT> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOpT op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOpT op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Generic tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID = 99, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
+ */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TxnWord *d_tile_descriptors; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; 
+ ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < 
num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile inclusive value + ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + do { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + + __threadfence(); // prevent hoisting loads from loop or loads below above this one + + } while (status == SCAN_TILE_INVALID); + + if (status == StatusWord(SCAN_TILE_PARTIAL)) + value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); + else + value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); + } +}; + + +/****************************************************************************** + * ReduceByKey tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Tile status 
interface for reduction by key. + * + */ +template < + typename ValueT, + typename KeyT, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState +{ + typedef KeyValuePairKeyValuePairT; + + // Constants + enum + { + PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), + TXN_WORD_SIZE = 1 << Log2::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) + struct TileDescriptorBigStatus + { + KeyT key; + ValueT value; + StatusWord status; + }; + + // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) + struct TileDescriptorLittleStatus + { + ValueT value; + StatusWord status; + KeyT key; + }; + + // Device word type + typedef typename If< + (sizeof(ValueT) == sizeof(KeyT)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + 
TileDescriptor; + + + // Device storage + TxnWord *d_tile_descriptors; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + 
tile_descriptor.value = tile_inclusive.value; + tile_descriptor.key = tile_inclusive.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.key = tile_partial.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + KeyValuePairT &value) + { +// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// TileDescriptor tile_descriptor = reinterpret_cast(alias); +// +// while (tile_descriptor.status == SCAN_TILE_INVALID) +// { +// __threadfence_block(); // prevent hoisting loads from loop +// +// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// tile_descriptor = reinterpret_cast(alias); +// } +// +// status = tile_descriptor.status; +// value.value = tile_descriptor.value; +// value.key = tile_descriptor.key; + + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.key = tile_descriptor.key; + } + +}; + + +/****************************************************************************** + * Prefix call-back operator for 
coupling local block scan within a + * block-cooperative scan + ******************************************************************************/ + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. + */ +template < + typename T, + typename ScanOpT, + typename ScanTileStateT, + int PTX_ARCH = CUB_PTX_ARCH> +struct TilePrefixCallbackOp +{ + // Parameterized warp reduce + typedef WarpReduce WarpReduceT; + + // Temporary storage type + struct _TempStorage + { + typename WarpReduceT::TempStorage warp_reduce; + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + // Type of status word + typedef typename ScanTileStateT::StatusWord StatusWord; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + TilePrefixCallbackOp( + ScanTileStateT &tile_status, + TempStorage &temp_storage, + ScanOpT scan_op, + int tile_idx) + : + temp_storage(temp_storage.Alias()), + tile_status(tile_status), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the warp-wide window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, ///< Preceding tile index to inspect + StatusWord &predecessor_status, ///< [out] Preceding tile status + T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, 
predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. + + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + temp_storage.block_aggregate = block_aggregate; + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + __device__ 
__forceinline__ + T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + __device__ __forceinline__ + T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh new file mode 100644 index 0000000..1125fe5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh @@ -0,0 +1,596 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockAdjacentDifference +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(b, a, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(b, a); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void 
FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's 
discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockAdjacentDifference() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockAdjacentDifference( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first 
tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
+ { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh new file mode 100644 index 0000000..428882f --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh @@ -0,0 +1,1148 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
+ * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's 
discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + 
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile 
item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... 
+ * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Head & tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. 
+ * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. 
+ * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
+ tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. 
\p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. 
+ * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... 
+ * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh new file mode 100644 index 0000000..c0e32fd --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh @@ -0,0 +1,1248 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the device-accessible memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). 
+ * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. 
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? 
(TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct __align__(16) _TempStorage + { + InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + unsigned int lane_id; + unsigned int warp_id; + unsigned int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + if (warp_id == 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + #pragma unroll + for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + // Warp time-slicing + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. 
+ */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + CTA_SYNC(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + lane_id(LaneId()), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(input_items, output_items, Int2Type()); + } + + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to device-accessible memory. + * + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(input_items, output_items, Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. 
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + //@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + __device__ __forceinline__ void StripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(items, items); + } + + __device__ __forceinline__ void WarpStripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToWarpStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(items, items); + } + + template + __device__ __forceinline__ void ScatterToBlocked( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStriped( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStripedGuarded(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + ScatterToStriped(items, items, ranks, is_valid); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +template < + typename T, + int ITEMS_PER_THREAD, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + // Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + // Insert padding if the number of items per thread is a 
power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct _TempStorage + { + T buff[WARP_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{WarpExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + +public: + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpExchange( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + temp_storage.buff[ranks[ITEM]] = items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage.buff[item_offset]; + } + } + +}; + + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh new file mode 100644 index 0000000..5d393c2 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or device-accessible memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. 
+ typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort, + BlockHistogramAtomic >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + CTA_SYNC(); + + // Composite the histogram + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh new file mode 100644 index 0000000..234dad2 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh @@ -0,0 +1,1268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM]; + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + { + items[ITEM] = thread_itr[ITEM]; + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Internal implementation for load vectorization + */ +template < + CacheLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Biggest memory access word that T is a whole multiple of + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), + + VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? + 4 : + (TOTAL_WORDS % 2 == 0) ? 
+ 2 : + 1, + + VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Vector items + Vector vec_items[VECTORS_PER_THREAD]; + + // Aliased input ptr + Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + { + vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); +} + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group + +/** @} */ // end group UtilIo + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). 
+ */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * For example, ld.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorTis not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [warp-striped arrangement](index.html#sec5sec3) of data is + * read efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). 
+ * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly larger latencies than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + * - Provisions more shared storage, but incurs smaller latencies than the + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE, + + + /** + * \par Overview + * + * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and then is locally transposed into a + * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory + * requirement, only one warp's worth of shared memory is provisioned and is + * subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, +}; + + +/** + * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. 
+ * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockLoad can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,. 
A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockLoad} + * \par + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back 
assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT> + __device__ __forceinline__ void Load( + CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + InputT 
(&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Load( + _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; 
+ + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectStriped(linear_tid, 
block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF 
spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + 
temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + 
/****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
+ * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). 
+ * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. 
+ * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh new file mode 100644 index 0000000..77500ba --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh @@ -0,0 +1,697 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block + */ + +#pragma once + +#include + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. 
+ * \ingroup BlockModule + * + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RADIX_BITS The number of radix bits per digit place + * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * Blah... + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par Examples + * \par + * - Example 1: Simple radix rank of 32-bit integer keys + * \code + * #include + * + * template + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + PADDED_COUNTER_LANES = COUNTER_LANES + 1, + RAKING_SEGMENT = PADDED_COUNTER_LANES, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + }; + +private: + + + /// BlockScan type + typedef BlockScan< + PackedCounter, + BLOCK_DIM_X, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) 
_TempStorage + { + union Aliasable + { + DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + + } aliasable; + + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + raking_ptr = smem_raking_ptr; + } + + return internal::ThreadReduce(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? 
+ cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) + { + *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Block-scan prefix callback + */ + struct PrefixCallBack + { + __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) + { + PackedCounter block_prefix = 0; + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + return block_prefix; + } + }; + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PrefixCallBack prefix_call_back; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
+ */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Get digit + unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + + // Get sub-counter + unsigned int sub_counter = digit >> LOG_COUNTER_LANES; + + // Get counter lane + unsigned int counter_lane = digit & (COUNTER_LANES - 1); + + if (IS_DESCENDING) + { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + // Pointer to smem digit counter + 
digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[ITEM] = *digit_counters[ITEM]; + + // Store inclusive prefix + *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; + } + + CTA_SYNC(); + + // Scan shared memory counters + ScanCounters(); + + CTA_SYNC(); + + // Extract the local ranks of each key + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Add in thread block exclusive prefix + ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; + } + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) 
+ unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + + exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; + } + } + } +}; + + + + + +/** + * Radix-rank using match.any + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRankMatch +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + typedef int32_t RankT; + typedef int32_t DigitCounterT; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + PADDED_WARPS = ((WARPS & 0x1) == 0) ? + WARPS + 1 : + WARPS, + + COUNTERS = PADDED_WARPS * RADIX_DIGITS, + RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, + PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? 
+ RAKING_SEGMENT + 1 : + RAKING_SEGMENT, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), + }; + +private: + + /// BlockScan type + typedef BlockScan< + DigitCounterT, + BLOCK_THREADS, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScanT; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + typename BlockScanT::TempStorage block_scan; + + union __align__(16) Aliasable + { + volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; + DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; + + } aliasable; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRankMatch( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + // Initialize shared digit counters + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; + + CTA_SYNC(); + + // Each warp will strip-mine its section of input, one strip at a time + + volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; + uint32_t lane_id = LaneId(); + uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; + uint32_t lane_mask_lt = LaneMaskLt(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // My digit + uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + + if (IS_DESCENDING) + digit = RADIX_DIGITS - digit - 1; + + // Mask of peers who have same digit as me + uint32_t peer_mask = MatchAny(digit); + + // Pointer to smem digit counter for this key + digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; + + // Number of occurrences in previous strips + DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of peers having same digit as me + int32_t digit_count = __popc(peer_mask); + + // Number of lower-ranked peers having same digit seen so far + int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); + + if (peer_digit_prefix == 0) + { + // First thread for each digit updates the shared warp counter + *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); + } + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of prior keys having same digit + ranks[ITEM] = warp_digit_prefix + 
DigitCounterT(peer_digit_prefix); + } + + CTA_SYNC(); + + // Scan warp counters + + DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; + + BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; + + CTA_SYNC(); + + // Seed ranks with counter values from previous warps + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + ranks[ITEM] += *digit_counters[ITEM]; + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, current_bit, num_bits); + + // Get exclusive count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh new file mode 100644 index 0000000..736fbde --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh @@ -0,0 +1,862 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam KeyT KeyT type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. 
Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ +template < + typename KeyT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename ValueT = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // KeyT traits and unsigned bits type + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + union _TempStorage + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// 
Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked 
arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + ValueT (&/*values*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not 
document + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// 
\smemstorage{BlockRadixSort} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. 
+ * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. 
+ * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. 
To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
+ * + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... 
+ * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. 
+ * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh new file mode 100644 index 0000000..ab6b710 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? 
+ (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load + USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + struct __align__(16) _TempStorage + { + T buff[BlockRakingLayout::GRID_ELEMENTS]; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (USE_SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias().buff + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh new file mode 100644 index 
0000000..a9de9e7 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA thread block. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *

\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE + * and is preferable when the reduction operator is commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, + + + /** + * \par Overview + * An efficient "raking" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. \blocked. + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs more communication than BLOCK_REDUCE_RAKING + * and is only preferable when the reduction operator is non-commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. + * + * \par + * Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. + * + * \par + * \image html block_scan_warpscans.png + *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. 
A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : 
Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction functor + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. 
+ * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + } + + + //@} end member group +}; + +/** + * \example example_block_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh new file mode 100644 index 0000000..245084c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh @@ -0,0 +1,2126 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. + */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being scanned + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - \rowmajor + * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: + * -# cub::BLOCK_SCAN_RAKING. 
An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Invokes a minimal number of minimal block-wide synchronization barriers (only + * one or two depending on algorithm selection) + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (vs. generic scan) + * - \blocksize + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. + * The corresponding output \p thread_data in those threads will be + * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with thread block sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? 
+ BLOCK_SCAN_RAKING : + ALGORITHM; + + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; + + /// Define the delegate type for the desired algorithm + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + WarpScans, + Raking>::Type InternalBlockScan; + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Public types + ******************************************************************************/ +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
+ * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. + * The output for the second segment will be 128, 129, ..., 255. 
+ * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. 
+ * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
+ * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. 
+ * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. 
Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad BlockLoad;
+ * typedef cub::BlockStore BlockStore;
+ * typedef cub::BlockScan BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, prefix_op);
+ * CTA_SYNC();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan functor
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ * - Supports non-commutative scan operators. 
+ * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan (with no initial value) + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include // or equivalently
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixCallbackOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh new file mode 100644 index 0000000..504f00e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShuffle abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. 
+ * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShuffle +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T prev[BLOCK_THREADS]; + T next[BLOCK_THREADS]; + }; + + +public: + + /// \smemstorage{BlockShuffle} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory 
as temporary storage. + */ + __device__ __forceinline__ BlockShuffle() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShuffle( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Shuffle movement + *********************************************************************/ + //@{ + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Offset( + T input, ///< [in] The input item from the calling thread (threadi) + T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 + int distance = 1) ///< [in] Offset distance (may be negative) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) + output = temp_storage[linear_tid + distance].prev; + } + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Rotate( + T input, ///< [in] The calling thread's input item + T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). 
This value is not updated for threadBLOCK_THREADS-1 + unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + unsigned int offset = threadIdx.x + distance; + if (offset >= BLOCK_THREADS) + offset -= BLOCK_THREADS; + + output = temp_storage[offset].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. 
+ T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads + { + Up(input, prev); + block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
+ T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + { + Up(input, prev); + block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh new file mode 100644 index 0000000..63039af --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh @@ -0,0 +1,1000 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. 
+ * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
+ MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). 
+ */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. 
+ * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. 
[More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... 
+ * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template 
+ struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + 
/// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared 
storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : 
Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ 
_TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. 
The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items); + } + + /** + * \brief Store items into a linear segment of memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
+ * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. + * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 0000000..4599c09 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename CounterT, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 0000000..b9ad6fb --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? 
true : false, + BLOCK_SCAN_WARP_SCANS, + cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to 
shared/device-accessible memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + CTA_SYNC(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + CTA_SYNC(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + CTA_SYNC(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 0000000..c2c2665 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. + * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. 
+ */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + union _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + // Update partial if addend is in range + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + return partial; + } + + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool IS_FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. 
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 0000000..ee22946 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 
+ */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce WarpReduce; + + /// Layout type for padded thread block raking grid + typedef 
BlockRakingLayout BlockRakingLayout; + + /// Shared memory storage layout type + union _TempStorage + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
+ template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 0000000..68495b4 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
+ */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the thread block size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire thread block + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int 
linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + CTA_SYNC(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + cub::Sum()); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < static_cast(num_valid)) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 0000000..2e21324 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,666 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + 
_TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type /*iteration*/) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) + {} + + + /// Performs upsweep raking reduction, returning the aggregate + template + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data into registers + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + + T raking_partial = cached_segment[0]; + + return GuardedReduce(cached_segment, scan_op, raking_partial, 
Int2Type<1>()); + } + + + /// Performs exclusive downsweep raking scan + template + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Performs inclusive downsweep raking scan + template + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + exclusive_output = *placement_ptr; + } + } + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = 
block_prefix; + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, 
downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 0000000..9252c0a --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; + + /// Shared memory storage layout type + + struct __align__(32) _TempStorage + { + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
+ template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
+ template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
+ T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
+ { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 0000000..eb0a3a1 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ 
BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
+ template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
+ template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. 
+// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 0000000..18bd585 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : 
Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. 
The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh new file mode 100644 index 0000000..b1c8e32 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +//#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" +#include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" +#include "device/device_select.cuh" +#include "device/device_spmv.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh new file mode 100644 index 0000000..db131ee --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh @@ -0,0 +1,866 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_histogram.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Evenly-segmented bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
+ * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. 
\offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. 
+ LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam 
SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + //@} end member group + /******************************************************************//** + * \name Custom bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of an six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. 
+ * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * int row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , , , ] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). 
+ * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramRange( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). 
+ * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [3, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + + //@} end member group +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh new file mode 100644 index 0000000..154506e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh @@ -0,0 +1,273 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. 
Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh new file mode 100644 index 0000000..fe6cad6 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh @@ -0,0 +1,796 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. 
+ * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. 
+ * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 
+ * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). 
+ * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... 
+ * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. 
This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). 
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... 
+ * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh new file mode 100644 index 0000000..3939a7e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh @@ -0,0 +1,734 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * __device__ __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * CustomMin min_op; + * int init; // e.g., INT_MAX + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + T init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide sum using the addition (\p +) operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction. + * - Does not support \p + operators that are non-commutative.. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sum-reduction performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html reduce_int32.png + * \image html reduce_int64.png + * + * \par Snippet + * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction. + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // d_out <-- [{5, 0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // d_out <-- [9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // d_out <-- [{6, 9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + * + * \par + * This operation computes segmented reductions within \p d_values_in using + * the specified binary \p reduction_op functor. The segments are identified by + * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of + * consecutive, identical keys. For the ith run encountered, + * the first key of the run and the corresponding value aggregate of that run are + * written to d_unique_out[i] and d_aggregates_out[i], + * respectively. The total number of runs encountered is written to \p d_num_runs_out. + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. 
+ * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following chart illustrates reduction-by-key (sum) performance across + * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_num_runs_out; // e.g., [-] + * CustomMin reduction_op; + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_aggregates_out <-- [0, 1, 6, 2, 4] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator + * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename ReductionOpT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // FlagT iterator type (not used) + + // Selection op (not used) + + // Default == operator + typedef Equality EqualityOp; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh new file mode 100644 index 0000000..ed0bf9c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_rle.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A run-length encoding + * computes a simple compressed representation of a sequence of input elements such that each + * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a + * count of the elements in that run. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceRunLengthEncode} + * + * \par Performance + * \linear_performance{run-length encode} + * + * \par + * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceRunLengthEncode +{ + + /** + * \brief Computes a run-length encoding of the sequence \p d_in. + * + * \par + * - For the ith run encountered, the first key of the run and its length are written to + * d_unique_out[i] and d_counts_out[i], + * respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_unique_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_counts_out <-- [1, 2, 1, 3, 1] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename UniqueOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Encode( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator LengthsInputIteratorT; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + LengthsInputIteratorT((LengthT) 1), + d_counts_out, + d_num_runs_out, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. + * + * \par + * - For the ith non-trivial run, the run's starting offset + * and its length are written to d_offsets_out[i] and + * d_lengths_out[i], respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * + * \par Snippet + * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_offsets_out; // e.g., [ , , , , , , , ] + * int *d_lengths_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // d_offsets_out <-- [1, 4] + * // d_lengths_out <-- [2, 3] + * // d_num_runs_out <-- [2] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename OffsetsOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t NonTrivialRuns( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh new file mode 100644 index 0000000..4589279 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh @@ -0,0 +1,443 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_scan.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) + * \ingroup SingleModule + * + * \par Overview + * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output sequence where each element is computed to be the reduction + * of the elements occurring earlier in the input sequence. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * + * \par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm + * for performing global prefix scan with only a single pass through the + * input data, as described in our 2016 technical report [1]. The central + * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies + * of global prefix propagation with local computation. As such, our algorithm requires only + * ~2n data movement (n inputs are read, n outputs are written), and typically + * proceeds at "memcpy" speeds. + * + * \par + * [1] [Duane Merrill and Michael Garland. 
"Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * \linear_performance{prefix scan} + * + * \par + * The following chart illustrates DeviceScan::ExclusiveSum + * performance across different CUDA architectures for \p int32 keys. + * \plots_below + * + * \image html scan_int32.png + * + */ +struct DeviceScan +{ + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated exclusive sum performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html scan_int32.png + * \image html scan_int64.png + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0, 8, 14, 21, 26, 29, 29] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Initial value + OutputT init_value = 0; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op + * ... 
+ * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. 
Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... 
+ * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh new file mode 100644 index 0000000..7f8bf8e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh @@ -0,0 +1,875 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. 
+ * + * \par + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedRadixSort} + * + */ +struct DeviceSegmentedRadixSort +{ + + /******************************************************************//** + * \name Key-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). 
+ * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t 
SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh new file mode 100644 index 0000000..1964ec1 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh @@ -0,0 +1,619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedReduce} + * + */ +struct DeviceSegmentedReduce +{ + /** + * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * CustomMin min_op; + * int initial_value; // e.g., INT_MAX + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT, + typename ReductionOp, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOp reduction_op, ///< [in] Binary reduction functor + T initial_value, ///< [in] Initial value of the reduction for each segment + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented sum using the addition ('+') operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction for each segment. 
+ * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p + operators that are non-commutative.. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [21, 0, 17] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. 
+ * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary 
storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
+ * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [8, INT_MIN, 9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh new file mode 100644 index 0000000..58bfe82 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh @@ -0,0 +1,369 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. 
+ * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. 
+ * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh new file mode 100644 index 0000000..8f3a4c5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh @@ -0,0 +1,174 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). 
+ */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). + * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). 
+ * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
+ int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = 1.0; + spmv_params.beta = 0.0; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 0000000..cdebd8b --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. 
+ */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between 
starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + 
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + 
+ // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT 
level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return 
histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t PrivatizedDispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. 
+ OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int max_num_output_bins, ///< [in] Maximum number of output bins in any channel + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel + DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel + KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? 
+ int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int 
CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, 
histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many 
bins to keep in shared memory. + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. 
For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* 
d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 
MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 
+ LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], 
scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } + while (0); + + return error; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh new file mode 100644 index 0000000..b4559ad --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh @@ -0,0 +1,1657 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "../../agent/agent_radix_sort_upsweep.cuh" +#include "../../agent/agent_radix_sort_downsweep.cuh" +#include "../../agent/agent_scan.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortUpsweepKernel( + const KeyT *d_keys, ///< [in] Input keys buffer + OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
+ OffsetT /*num_items*/, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortUpsweep type for the current configuration + typedef AgentRadixSortUpsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, + typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type, + KeyT, + OffsetT> + AgentRadixSortUpsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; + + // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block + even_share.template BlockInit(); + + AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); + + upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); + + CTA_SYNC(); + + // Write out digit counts (striped) + upsweep.ExtractCounts(d_spine, gridDim.x, blockIdx.x); +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanBinsKernel( + OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
+ int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the AgentScan type for the current configuration + typedef AgentScan< + typename ChainedPolicyT::ActivePolicy::ScanPolicy, + OffsetT*, + OffsetT*, + cub::Sum, + OffsetT, + OffsetT> + AgentScanT; + + // Shared memory storage + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Block scan instance + AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; + + // Process full input tiles + int block_offset = 0; + BlockScanRunningPrefixOp prefix_op(0, Sum()); + while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op); + block_offset += AgentScanT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortDownsweepKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
+ OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortDownsweep type for the current configuration + typedef AgentRadixSortDownsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, + typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, + IS_DESCENDING, + KeyT, + ValueT, + OffsetT> + AgentRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.template BlockInit(); + + // Process input tiles + AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + +/** + * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + 
typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? 
+ ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = 
BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + 
bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct 
Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type 
AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / 
SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, 
CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = 6, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + 
typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = 7, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = 7, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = 6, + 
SEGMENTED_RADIX_BITS = 6, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = 7, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, 
RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; + + // Upsweep 
policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = 6, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = 6, + SEGMENTED_RADIX_BITS = 6, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / 
SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + begin_bit(begin_bit), + end_bit(end_bit), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version), + is_overwrite_okay(is_overwrite_okay) + {} + + + 
//------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block to sort in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Log single_tile_kernel configuration + if (debug_synchronous) + _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_keys.Current(), + d_keys.Alternate(), + d_values.Current(), + d_values.Alternate(), + num_items, + begin_bit, + end_bit); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update selector + d_keys.selector ^= 1; 
+ d_values.selector ^= 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation + //------------------------------------------------------------------------------ + + /** + * Invoke a three-kernel sorting pass at the current bit. + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + int spine_length, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log upsweep_kernel configuration + if (debug_synchronous) + _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, + pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + + /*printf("running upsweep kernel with pass_bits %d\n", pass_bits);*/ + /*printf("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",*/ + /*pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,*/ + /*pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);*/ + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + pass_config.upsweep_kernel<<>>( + d_keys_in, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) 
break; + + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); + + // Invoke scan_kernel + pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + d_spine, + spine_length); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log downsweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, + pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + pass_config.downsweep_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + + /// Pass configuration structure + template < + typename UpsweepKernelT, + typename ScanKernelT, + typename DownsweepKernelT> + struct PassConfig + { + UpsweepKernelT upsweep_kernel; + KernelConfig upsweep_config; + ScanKernelT scan_kernel; + KernelConfig scan_config; + DownsweepKernelT downsweep_kernel; + KernelConfig downsweep_config; + int radix_bits; + int radix_digits; + int max_downsweep_grid_size; + GridEvenShare even_share; + + /// Initialize pass configuration + template < + 
typename UpsweepPolicyT, + typename ScanPolicyT, + typename DownsweepPolicyT> + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig( + UpsweepKernelT upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + int ptx_version, + int sm_count, + int num_items) + { + cudaError error = cudaSuccess; + do + { + this->upsweep_kernel = upsweep_kernel; + this->scan_kernel = scan_kernel; + this->downsweep_kernel = downsweep_kernel; + radix_bits = DownsweepPolicyT::RADIX_BITS; + radix_digits = 1 << radix_bits; + + if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; + if (CubDebug(error = scan_config.Init(scan_kernel))) break; + if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + + even_share.DispatchInit( + num_items, + max_downsweep_grid_size, + CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + + } + while (0); + return error; + } + + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel + typename ScanKernelT, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + DownsweepKernelT 
alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)upsweep_kernel; + (void)alt_upsweep_kernel; + (void)scan_kernel; + (void)downsweep_kernel; + (void)alt_downsweep_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular and alternate-digit kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>( + upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; + + if ((error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>( + alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + + // Get maximum spine length + int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[3]; + size_t allocation_sizes[3] = + { + spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 
0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); + + // Alias the temporary storage allocations + OffsetT *d_spine = static_cast(allocations[0]); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_spine, spine_length, current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;; + + // Invert selectors + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceRadixSortSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, 
OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items, ///< [in] Number of items to sort + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSegmentedRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Parameter members + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructors + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + begin_bit(begin_bit), + end_bit(end_bit), + is_overwrite_okay(is_overwrite_okay), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Multi-segment invocation + //------------------------------------------------------------------------------ + + /// Invoke a three-kernel sorting pass at the current bit. 
+ template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log kernel configuration + if (debug_synchronous) + _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + num_segments, pass_config.segmented_config.block_threads, (long long) stream, + pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); + + pass_config.segmented_kernel<<>>( + d_keys_in, d_keys_out, + d_values_in, d_values_out, + d_begin_offsets, d_end_offsets, num_segments, + current_bit, pass_bits); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + /// PassConfig data structure + template + struct PassConfig + { + SegmentedKernelT segmented_kernel; + KernelConfig segmented_config; + int radix_bits; + int radix_digits; + + /// Initialize pass configuration + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) + { + this->segmented_kernel = segmented_kernel; + this->radix_bits = SegmentedPolicyT::RADIX_BITS; + this->radix_digits = 1 << radix_bits; + + return CubDebug(segmented_config.Init(segmented_kernel)); + } + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel + CUB_RUNTIME_FUNCTION 
__forceinline__ + cudaError_t InvokePasses( + SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_kernel; + (void)alt_segmented_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Init regular and alternate kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; + if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + if (temp_storage_bytes == 0) + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; + int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + radix_bits - 1) / radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Invert selectors and update current bit + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + + /// Internal dispatch routine + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 0000000..b6aa44c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_reduce.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). 
Computes privatized reductions, one per thread block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + ReductionOpT reduction_op) ///< [in] Binary reduction functor +{ + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + d_out[blockIdx.x] = block_aggregate; +} + + +/** + * Reduce a single tile kernel entry point (single-block). 
Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + + +/// Normalize input iterator to segment offset +template +__device__ __forceinline__ +void NormalizeReductionOutput( + T &/*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) +{} + + +/// Normalize input 
iterator to segment offset (specialized for arg-index) +template +__device__ __forceinline__ +void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) +{ + val.key -= base_offset; +} + + +/** + * Segmented reduction (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + 
BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // 
SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_NOMINAL_CONFIG(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... 
else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + // Data type of output iterator + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_items(num_items), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block block to reduce in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + 
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke single_reduce_sweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation (two-pass) + //------------------------------------------------------------------------------ + + /// Invoke two-passes to reduce + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void) reduce_kernel; + (void) single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug(error = 
reduce_config.Init(reduce_kernel))) break; + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + OutputT *d_block_reductions = (OutputT*) allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, + reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + reduce_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + 
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_block_reductions, + d_out, + reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + 
typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchSegmentedReduce : + DeviceReducePolicy< + typename std::iterator_traits::value_type, + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of 
cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION 
__forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 0000000..672bc49 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs 
encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of items to select from +{ + // Thread block type for reducing tiles of value segments + typedef AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT> + AgentReduceByKeyT; + + // Shared memory for AgentReduceByKey + __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; + + // Process tiles + AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchReduceByKey +{ + //------------------------------------------------------------------------- 
+ // Types and constants + //------------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), + COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + //------------------------------------------------------------------------- + // Tuning policies + //------------------------------------------------------------------------- + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX 
compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. 
+ */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduce-by-key using the + * specified kernel functions. + */ + template < + typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel + typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
+ bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if 
(CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, 
reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh new file mode 100644 index 0000000..1de979e --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of 
\p d_offsets_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentRle< + AgentRlePolicyT, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + EqualityOpT, + OffsetT> AgentRleT; + + // Shared memory for AgentRle + __shared__ typename AgentRleT::TempStorage temp_storage; + + // Process tiles + AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_runs_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRle + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRleDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The input value type + typedef typename std::iterator_traits::value_type T; + + // The lengths output value type + typedef typename 
If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + 
RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + 
device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int 
num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size 
for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh new file mode 100644 index 0000000..8944dcd --- /dev/null +++ 
b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + 
*/ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs 
\iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// 
SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_NOMINAL_CONFIG(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + 
//--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. 
+ */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + 
ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + scan_kernel_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<>>( + d_in, + d_out, + tile_state, + start_tile, + scan_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // 
CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanInitKernel, + DeviceScanKernel, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh new file mode 100644 index 0000000..6f03319 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh @@ -0,0 +1,542 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void 
DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename 
NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 
7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + 
CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide selection using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel + KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_flags; + (void)d_selected_out; + (void)d_num_selected_out; + (void)select_op; + (void)equality_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)scan_init_kernel; + (void)select_if_kernel; + (void)select_if_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile 
status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke scan_init_kernel to initialize tile descriptors + scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + range_select_sm_occupancy, // out + select_if_kernel, + select_if_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); + + // Invoke select_if_kernel + select_if_kernel<<>>( + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + num_items, + num_tiles); + + // Check for failure to 
launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_if_config; + InitConfigs(ptx_version, select_if_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceSelectSweepKernel, + select_if_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh new file mode 100644 index 0000000..267bbe2 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -0,0 +1,850 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
+ */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; + } + + spmv_params.d_vector_y[row_idx] = value; + } +} + + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
+ */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT> ///< SpmvParams type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ 
(int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename ScanTileStateT> ///< Tile status interface type +__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile 
status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + cub::Sum, + OffsetT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + 
LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? 
BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template 
Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel + typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel + typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel + typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel + SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel + SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel + SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for + KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + if (spmv_params.num_cols == 1) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_1col_kernel<<>>( + 
spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Total number of spmv work items + int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; + + // Tile sizes of kernels + int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; + int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; + + // Number of tiles for kernels + unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; + unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + + // Get SM occupancy for kernels + int spmv_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + spmv_sm_occupancy, + spmv_kernel, + spmv_config.block_threads))) break; + + int segment_fixup_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + segment_fixup_sm_occupancy, + segment_fixup_kernel, + segment_fixup_config.block_threads))) break; + + // Get grid dimensions + dim3 spmv_grid_size( + CUB_MIN(num_merge_tiles, max_dim_x), + (num_merge_tiles + max_dim_x - 1) / max_dim_x, + 1); + + dim3 segment_fixup_grid_size( + CUB_MIN(num_segment_fixup_tiles, max_dim_x), + (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + 1); + + // Get the temporary storage allocation requirements + size_t allocation_sizes[3]; + if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors + allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs + allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + void* allocations[3]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; + + // Alias the other allocations + KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs + CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates + + // Get search/init grid dims + int search_block_size = INIT_KERNEL_THREADS; + int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + +#if (CUB_PTX_ARCH == 0) + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; +#endif + + if (search_grid_size < sm_count) +// if (num_merge_tiles < spmv_sm_occupancy * sm_count) + { + // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords + d_tile_coordinates = NULL; + } + else + { + // Use separate search kernel if we have enough spmv tiles to saturate the device + + // Log spmv_search_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", + search_grid_size, search_block_size, (long long) stream); + + // 
Invoke spmv_search_kernel + spmv_search_kernel<<>>( + num_merge_tiles, + d_tile_coordinates, + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log spmv_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); + + // Invoke spmv_kernel + spmv_kernel<<>>( + spmv_params, + d_tile_coordinates, + d_tile_carry_pairs, + num_merge_tiles, + tile_state, + num_segment_fixup_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Run reduce-by-key fixup if necessary + if (num_merge_tiles > 1) + { + // Log segment_fixup_kernel configuration + if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); + + // Invoke segment_fixup_kernel + segment_fixup_kernel<<>>( + d_tile_carry_pairs, + spmv_params.d_vector_y, + num_merge_tiles, + num_segment_fixup_tiles, + tile_state); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + +#if (CUB_PTX_ARCH == 0) + // Free textures + if (CubDebug(error = 
spmv_params.t_vector_x.UnbindTexture())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig spmv_config, segment_fixup_config; + InitConfigs(ptx_version, spmv_config, segment_fixup_config); + + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + +/* + // Dispatch + if (spmv_params.beta == 0.0) + { + if (spmv_params.alpha == 1.0) + { + // Dispatch y = A*x + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + else + { + // Dispatch y = alpha*A*x + if (CubDebug(error = 
Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + } + else + { + if (spmv_params.alpha == 1.0) + { + // Dispatch y = A*x + beta*y + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + else + { + // Dispatch y = alpha*A*x + beta*y + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + } + } +*/ + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh new file mode 100644 index 0000000..d9f8336 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with 
its sync counter + __threadfence(); + CTA_SYNC(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + CTA_SYNC(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh new file mode 100644 index 0000000..3ba29da --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). 
+ */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "grid_mapping.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. + * + * \par Overview + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an + * instance of GridEvenShare. The instance can be passed to child thread blocks + * which can initialize their per-thread block offsets using \p BlockInit(). + */ +template +struct GridEvenShare +{ +private: + + OffsetT total_tiles; + int big_shares; + OffsetT big_share_items; + OffsetT normal_share_items; + OffsetT normal_base_offset; + +public: + + /// Total number of input items + OffsetT num_items; + + /// Grid size in thread blocks + int grid_size; + + /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles + OffsetT block_offset; + + /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles + OffsetT block_end; + + /// Stride between input tiles + OffsetT block_stride; + + + /** + * \brief Constructor. 
+ */ + __host__ __device__ __forceinline__ GridEvenShare() : + total_tiles(0), + big_shares(0), + big_share_items(0), + normal_share_items(0), + normal_base_offset(0), + num_items(0), + grid_size(0), + block_offset(0), + block_end(0), + block_stride(0) + {} + + + /** + * \brief Dispatch initializer. To be called prior prior to kernel launch. + */ + __host__ __device__ __forceinline__ void DispatchInit( + OffsetT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int tile_items) ///< Number of data items per input tile + { + this->block_offset = num_items; // Initialize past-the-end + this->block_end = num_items; // Initialize past-the-end + this->num_items = num_items; + this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->grid_size = CUB_MIN(total_tiles, max_grid_size); + OffsetT avg_tiles_per_block = total_tiles / grid_size; + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + this->normal_share_items = avg_tiles_per_block * tile_items; + this->normal_base_offset = big_shares * tile_items; + this->big_share_items = normal_share_items + tile_items; + } + + + /** + * \brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. 
+ */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = TILE_ITEMS; + if (block_id < big_shares) + { + // This thread block gets a big share of grains (avg_tiles_per_block + 1) + block_offset = (block_id * big_share_items); + block_end = block_offset + big_share_items; + } + else if (block_id < total_tiles) + { + // This thread block gets a normal share of grains (avg_tiles_per_block) + block_offset = normal_base_offset + (block_id * normal_share_items); + block_end = CUB_MIN(num_items, block_offset + normal_share_items); + } + // Else default past-the-end + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = grid_size * TILE_ITEMS; + block_offset = (block_id * TILE_ITEMS); + block_end = num_items; + } + + + /** + * \brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. + */ + template < + int TILE_ITEMS, + GridMappingStrategy STRATEGY> + __device__ __forceinline__ void BlockInit() + { + BlockInit(blockIdx.x, Int2Type()); + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. 
+ */ + template + __device__ __forceinline__ void BlockInit( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + this->block_offset = block_offset; + this->block_end = block_end; + this->block_stride = TILE_ITEMS; + } + + +}; + + + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh new file mode 100644 index 0000000..6cd8920 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh @@ -0,0 +1,113 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An a "raking" access pattern in which each thread block is + * assigned a consecutive sequence of input tiles + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. 
The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_RAKE, + + /** + * \brief An a "strip mining" access pattern in which the input tiles assigned + * to each thread block are separated by a stride equal to the the extent of + * the grid. + * + * \par Overview + * The input is evenly partitioned into \p p sets, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each set is comprised of + * data tiles separated by stride \p tiles, where a tile is a small, + * constant-sized unit of input to be processed to completion before the + * thread block terminates or obtains more work. The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. 
+ */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh new file mode 100644 index 0000000..f413c6d --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_debug.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridQueue is a descriptor utility for dynamic queue management. + * + * \par Overview + * GridQueue descriptors provides abstractions for "filling" or + * "draining" globally-shared vectors. + * + * \par + * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, + * returning a unique offset for the calling thread to write its items. + * The GridQueue maintains the total "fill-size". The fill counter must be reset + * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that + * will be filling. + * + * \par + * Similarly, a "draining" GridQueue works by works by atomically-incrementing a + * zero-initialized counter, returning a unique offset for the calling thread to + * read its items. Threads can safely drain until the array's logical fill-size is + * exceeded. The drain counter must be reset using GridQueue::ResetDrain or + * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that + * will be filling. 
(For dynamic work distribution of existing data, the corresponding fill-size + * is simply the number of elements in the array.) + * + * \par + * Iterative work management can be implemented simply with a pair of flip-flopping + * work buffers, each with an associated set of fill and drain GridQueue descriptors. + * + * \tparam OffsetT Signed integer type for global offsets + */ +template +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + OffsetT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(OffsetT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor + __host__ __device__ __forceinline__ GridQueue() + : + d_counters(NULL) + {} + + + /// Constructs a GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). + : + d_counters((OffsetT*) d_storage) + {} + + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( + OffsetT fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. 
To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + OffsetT &fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + fill_size = d_counters[FILL]; + return cudaSuccess; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
+ __device__ __forceinline__ OffsetT Fill(OffsetT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + OffsetT num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh new file mode 100644 index 0000000..0054f1f --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + 
#else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh new file mode 100644 index 0000000..d3bce58 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh @@ -0,0 +1,259 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#include + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * + * \par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * dereference an array of doubles + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::ArgIndexInputIterator itr(d_in); + * + * // Within device code: + * typedef typename cub::ArgIndexInputIterator::value_type Tuple; + * Tuple item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 + * + * itr = itr + 6; + * item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 + * + * \endcode + * + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + */ +template < + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> +class ArgIndexInputIterator +{ +public: + + // Required iterator traits + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + 
>::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + InputIteratorT itr; + difference_type offset; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ArgIndexInputIterator( + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator + : + itr(itr), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + value_type retval; + retval.value = itr[offset]; + retval.key = offset; + return retval; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(itr, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(itr, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ 
__forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((itr == rhs.itr) && (offset == rhs.offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((itr != rhs.itr) || (offset != rhs.offset)); + } + + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh new file mode 100644 index 0000000..0c0252c --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh @@ -0,0 +1,240 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. 
+ * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + 
typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +public: + + /// Wrapped native pointer + ValueType* ptr; + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __device__ __forceinline__ reference operator*() const + { + return ThreadLoad(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __device__ __forceinline__ reference operator[](Distance n) const + { + return ThreadLoad(ptr + n); + } + + /// Structure dereference + __device__ __forceinline__ pointer operator->() + { + return &ThreadLoad(ptr); + } + + /// Equal to + 
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh new file mode 100644 index 0000000..8dbaafa --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). 
+ * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the 
iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return 
Reference(ptr + n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh new file mode 100644 index 0000000..0b7af47 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * dereference a sequence of homogeneous doubles. 
+ * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + OffsetT offset; +#ifdef _WIN32 + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + OffsetT offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { 
+ self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh new file mode 100644 index 0000000..3b42a00 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIteratorTto + * dereference a sequence of incrementing integers. 
+ * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ 
reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + (ValueType) n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += (ValueType) n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - (ValueType) n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return (difference_type) (val - other.val); + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + (ValueType) n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (val != rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh new file mode 100644 index 0000000..1fca08c --- /dev/null +++ b/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ 
self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ 
__device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 0000000..6236094 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexObjInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... + * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + typename OffsetT = ptrdiff_t> +class TexObjInputIterator +{ +public: + + // Required iterator traits + typedef TexObjInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + // Largest texture word we can use in device + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per 
T + enum { + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + +private: + + T* ptr; + difference_type tex_offset; + cudaTextureObject_t tex_obj; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexObjInputIterator() + : + ptr(NULL), + tex_offset(0), + tex_obj(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + this->tex_offset = tex_offset; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = this->ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return cudaDestroyTextureObject(tex_obj); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + 
#pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + 
+} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh new file mode 100644 index 0000000..da1fd16 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh @@ -0,0 +1,374 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer + +#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Static file-scope Tesla/Fermi-style texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Global texture reference specialized by type +template +struct IteratorTexRef +{ + /// And by unique ID + template + struct TexId + { + // Largest texture word we can use in device + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + + // 
Texture reference type + typedef texture TexRef; + + // Texture reference + static TexRef ref; + + /// Bind texture + static cudaError_t BindTexture(void *d_in, size_t &offset) + { + if (d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + ref.channelDesc = tex_desc; + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + } + + return cudaSuccess; + } + + /// Unbind texture + static cudaError_t UnbindTexture() + { + return CubDebug(cudaUnbindTexture(ref)); + } + + /// Fetch element + template + static __device__ __forceinline__ T Fetch(Distance tex_offset) + { + DeviceWord temp[DEVICE_MULTIPLE]; + TextureWord *words = reinterpret_cast(temp); + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); + } + + return reinterpret_cast(temp); + } + }; +}; + +// Texture reference definitions +template +template +typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; + + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. 
Only one TexRefInputIteratorT instance can be bound
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename OffsetT = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: +/* + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} +*/ + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position 
of the iterator + { + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure 
dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + +#endif // CUDA_VERSION diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh new file mode 100644 index 0000000..39258a4 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for transforming dereferenced values. + * + * \par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type \p ValueType from the latter. + * - Can be used with any data type. 
+ * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TransformInputIteratorTto + * dereference an array of integers, tripling the values and converting them to doubles. + * \par + * \code + * #include // or equivalently + * + * // Functor for tripling integer values and converting to doubles + * struct TripleDoubler + * { + * __host__ __device__ __forceinline__ + * double operator()(const int &a) const { + * return double(a * 3); + * } + * }; + * + * // Declare, allocate, and initialize a device array + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * TripleDoubler conversion_op; + * + * // Create an iterator wrapper + * cub::TransformInputIterator itr(d_in, conversion_op); + * + * // Within device code: + * printf("%f\n", itr[0]); // 24.0 + * printf("%f\n", itr[1]); // 18.0 + * printf("%f\n", itr[6]); // 27.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
+ * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIteratorT, + typename OffsetT = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIteratorT input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return 
conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh new file mode 100644 index 0000000..9de4bd4 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh @@ -0,0 
+1,438 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. 
+ */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIteratorT [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) + { + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 
ThreadLoad(uint4 const *ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache 
load modifier + */ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) +#else + _CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + _CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + _CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + +/** + * ThreadLoad 
definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + T retval = *reinterpret_cast(ptr); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + typedef typename UnitWord::DeviceWord 
DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(const_cast(ptr)), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh new file mode 100644 index 0000000..2bd5403 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh @@ -0,0 +1,317 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns (a == b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool 
operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns (a > b) ? a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) + */ +struct ArgMax +{ + /// Boolean max operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns (a < b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) + */ +struct ArgMin +{ + /// Boolean min operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default cast functor + */ +template +struct CastOp +{ + /// Cast operator, returns (B) a + template + __host__ __device__ __forceinline__ B operator()(const A &a) const + { + return (B) a; + } +}; + + +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. 
Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? + second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer 
namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh new file mode 100644 index 0000000..9e27705 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + +/** + * Sequential reduction over statically-sized array types + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type /*length*/) +{ + T retval = prefix; + + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); + + return retval; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh new file mode 100644 index 0000000..545b414 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + + +/** + * \brief Perform a sequential exclusive prefix scan over \p 
LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh new file mode 100644 index 0000000..379a08a --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // 
Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh new file mode 100644 index 0000000..14ee84d --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh @@ -0,0 +1,422 @@ 
+/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. 
+ */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIteratorT [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIteratorT, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define 
_CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, 
unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) +#else + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIteratorT itr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *ptr = val; +} 
+ + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = 
reinterpret_cast(&val)[i]; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh new file mode 100644 index 0000000..24c7a79 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh @@ -0,0 +1,708 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include +#include + +#include "host/mutex.cuh" +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. 
It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing device pointers + 
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset<BlockDescriptor, Compare> CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset<BlockDescriptor, Compare> BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map<int, TotalBytes> GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + 
//--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). 
+ */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. 
+ search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Attempt to allocate + if (CubDebug(error = 
cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. + if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + 
live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + return error; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. 
+ * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceFree( + int device, + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. 
(%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + } + + // Unlock + mutex.Unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + void* d_ptr) + { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh new file mode 100644 index 0000000..5ec36e5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
+#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif +#endif + + +/// Number of threads per warp +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif + + +/// Number of smem banks +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + + +/// Oversubscription factor +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? \ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif + + +/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. 
+#ifndef CUB_BLOCK_THREADS + #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS * 2, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif + +/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread +#ifndef CUB_ITEMS_PER_THREAD + #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_ITEMS_PER_THREAD * 2, \ + CUB_MAX( \ + 1, \ + (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_NOMINAL_CONFIG + #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif + + + +#endif // Do not document + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh new file mode 100644 index 0000000..1ad60cf --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
+ */ + +#pragma once + +#include <stdio.h> +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ + (void)filename; + (void)line; +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#ifndef CubDebug + #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) +#endif + + +/** + * \brief Debug macro with exit + */ +#ifndef CubDebugExit + #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } +#endif + + +/** + * \brief Log macro for printf statements. + */ +#if !defined(_CubLog) + #if !(defined(__clang__) && defined(__CUDA__)) + #if (CUB_PTX_ARCH == 0) + #define _CubLog(format, ...) printf(format,__VA_ARGS__); + #elif (CUB_PTX_ARCH >= 200) + #define _CubLog(format, ...) 
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); + #endif + #else + // XXX shameless hack for clang around variadic printf... + // Compiles w/o supplying -std=c++11 but shows warning, + // so we silence them :) + #pragma clang diagnostic ignored "-Wc++11-extensions" + #pragma clang diagnostic ignored "-Wunnamed-type-template-args" + template <class... Args> + inline __host__ __device__ void va_printf(char const* format, Args const&... args) + { + #ifdef __CUDA_ARCH__ + printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); + #else + printf(format, args...); + #endif + } + #ifndef __CUDA_ARCH__ + #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); + #else + #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); + #endif + #endif +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh new file mode 100644 index 0000000..fa73dbd --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). 
+ */ +template <int ALLOCATIONS> +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template <typename T> +__global__ void EmptyKernel(void) { } + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ + struct Dummy + { + /// Type definition of the EmptyKernel 
kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel to be generated if this class is used + CUB_RUNTIME_FUNCTION __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel; + } + }; + + +#ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#elif (CUB_PTX_ARCH > 0) + + ptx_version = CUB_PTX_ARCH; + return cudaSuccess; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * \brief Retrieves the SM version (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + } + while (0); + + return error; + +#endif +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Synchronize the stream if specified + */ +CUB_RUNTIME_FUNCTION __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#if (CUB_PTX_ARCH == 0) + return cudaStreamSynchronize(stream); +#else + (void)stream; + // Device can't yet sync on a specific stream + return cudaDeviceSynchronize(); +#endif +} + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p 
kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... + * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + 
cudaError_t Init(KernelPtrT kernel_ptr) + { + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; + + + +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; + + + + +#endif // Do not document + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh new file mode 100644 index 0000000..73c29d2 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif +#endif + +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif + +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) +#endif + +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif + +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif + +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif + +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif + + +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 
1 : -1] +#endif + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh new file mode 100644 index 0000000..edb6126 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#ifndef CUB_NS_PREFIX +#define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX +#define CUB_NS_POSTFIX +#endif diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh new file mode 100644 index 0000000..fae6e4f --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh @@ -0,0 +1,729 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" +#include "util_debug.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define 
_CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type /*byte_len*/) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> /*byte_len*/) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. 
Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. + */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm ("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. + */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), 
"r"(src_offset), "r"(first_lane), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane)); +#endif + return word; +} + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
+ */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm volatile("exit;"); +} + + +/** + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. 
+ */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... 
+ * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. + * + */ +template +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_lane, ///< [in] Index of first lane in segment (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... 
+ * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. + * + */ +template +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_lane, ///< [in] Index of first lane in segment (typically 31) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... 
+ * + * // Obtain item from thread 0 + * double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. + * + */ +template +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + int logical_warp_threads, ///< [in] Number of threads per logical warp + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + logical_warp_threads - 1, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + logical_warp_threads - 1, + member_mask); + + output_alias[WORD] = shuffle_word; + } + + return output; +} + + + +/** + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. 
+ */ +template +inline __device__ unsigned int MatchAny(unsigned int label) +{ + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} + + + + + + + + + + + + + + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh new file mode 100644 index 0000000..7de5427 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh @@ -0,0 +1,1141 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include +#include +#include + +#include "util_macro.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (IF ? 
ThenType : ElseType) + */ +template +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct If +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + +/** + * \brief Type equality test + */ +template +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Equals +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } + + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } + + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } + +#endif // DOXYGEN_SHOULD_SKIP_THIS 
+}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template +struct Int2Type +{ + enum {VALUE = A}; +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +/// Structure alignment +template +struct AlignBytes +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The "true CUDA" alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// The "truly aligned" type + typedef T Type; +}; + +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) +#ifdef _WIN32 + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) +#endif +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) +__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; + + +/// Unit-words of data movement 
+template +struct UnitWord +{ + enum { + ALIGN_BYTES = AlignBytes::ALIGN_BYTES + }; + + template + struct IsMultiple + { + enum { + UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + }; + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned int, + typename If::IS_MULTIPLE, + unsigned short, + unsigned char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + ulonglong2, + VolatileWord>::Type DeviceWord; + + /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + uint4, + typename If::IS_MULTIPLE, + uint2, + ShuffleWord>::Type>::Type TextureWord; +}; + + +// float2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint2 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef unsigned long long DeviceWord; +#endif + typedef float2 TextureWord; +}; + +// float4 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint4 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef ulonglong2 DeviceWord; +#endif + typedef float4 TextureWord; +}; + + +// char2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef unsigned short ShuffleWord; +#if (CUB_PTX_ARCH > 0) && 
(CUB_PTX_ARCH <= 130) + typedef unsigned short VolatileWord; + typedef short DeviceWord; +#else + typedef unsigned short VolatileWord; + typedef unsigned short DeviceWord; +#endif + typedef unsigned short TextureWord; +}; + + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Vector type inference utilities. + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. + */ +template struct CubVector; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template +struct CubVector +{ + T x; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-2 type + */ +template +struct CubVector +{ + T x; + T y; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-3 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-4 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + T w; + + typedef T BaseType; + typedef CubVector Type; +}; + + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + \ + template<> struct CubVector : short_type##1 \ + { \ + typedef base_type BaseType; \ + typedef short_type##1 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + return 
retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##2 \ + { \ + typedef base_type BaseType; \ + typedef short_type##2 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##3 \ + { \ + typedef base_type BaseType; \ + typedef short_type##3 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##4 \ + { \ + typedef base_type BaseType; \ + typedef short_type##4 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + retval.w = w + other.w; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + retval.w = w - other.w; \ + return retval; \ + } \ + }; + + + +// Expand CUDA vector types for built-in primitives 
+CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) +CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) + +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + */ +template +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast(*this); + } +}; + + +/** + * \brief A key identifier paired with a corresponding value + */ +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair +{ + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ 
__forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#if defined(_WIN32) && !defined(_WIN64) + +/** + * Win32 won't do 16B alignment. This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ + +/// Smaller key specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + + +/// Smaller value specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ 
__device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template +struct ArrayWrapper +{ + + /// Statically-sized array of type \p T + T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". 
+ */ +template +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } + + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } + +}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. 
The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template \ + struct detector_name \ + { \ + template \ + static char& test(typename C::nested_type_name*); \ + template \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + + +template +struct EnableIf {}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) + */ +template +struct BinaryOpHasIdxParam +{ +private: +/* + template struct SFINAE1 {}; + template struct SFINAE2 {}; + template struct SFINAE3 {}; + template struct SFINAE4 {}; +*/ + template struct SFINAE5 {}; + template struct SFINAE6 {}; + template struct SFINAE7 {}; + template struct SFINAE8 {}; +/* + template static char Test(SFINAE1 *); + template static char Test(SFINAE2 *); + template static char Test(SFINAE3 *); + template static char Test(SFINAE4 *); +*/ + template static char Test(SFINAE5 *); + template static char Test(SFINAE6 *); + template static char Test(SFINAE7 *); + template static char Test(SFINAE8 *); + + template static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third unsigned int index param + static const bool HAS_PARAM = 
sizeof(Test(NULL)) == sizeof(char); +}; + + + + +/****************************************************************************** + * Simple type traits utilities. + * + * For example: + * Traits::CATEGORY // SIGNED_INTEGER + * Traits::NULL_TYPE // true + * Traits::CATEGORY // NOT_A_NUMBER + * Traits::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + + +/** + * Basic type traits (unsigned primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) 
^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + + +/** + * Basic type traits (fp primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? 
HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } + + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; + + +/** + * \brief Numeric type traits + */ +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; + + + +/** + * \brief Type traits + */ +template +struct Traits : NumericTraits::Type> {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 0000000..682a5bf --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,551 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. 
All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + /// Number of logical warps in a PTX warp + LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + }; + + template + struct IsInteger + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + + // Creates a mask where the last thread in each logical warp is set + template + struct LastLaneMask + { + enum { + BASE_MASK = 1 << (LOGICAL_WARP_THREADS - 1), + MASK = (LastLaneMask::MASK << LOGICAL_WARP_THREADS) | BASE_MASK, + }; + }; + + // Creates a mask where the last thread in each logical warp is set + template + struct LastLaneMask + { + enum { + MASK = 1 << (LOGICAL_WARP_THREADS - 1), + }; + }; + + + + /// Shared memory storage layout type + typedef NullType 
TempStorage; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + + unsigned int lane_id; + + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &/*temp_storage*/) + : + lane_id(LaneId()), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? + 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + //--------------------------------------------------------------------- + // Reduction steps + //--------------------------------------------------------------------- + + /// Reduction (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int ReduceStep( + unsigned int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. 
+ SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } + + + /// Reduction step (generic) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T output = input; + + _T temp = ShuffleDown(output, offset, last_lane, member_mask); + + // Perform reduction op if valid + if (offset + lane_id <= last_lane) + output = reduction_op(input, temp); + + return output; + } + + + /// Reduction step (specialized for small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. 
+ ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ReduceStep( + T& input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + Int2Type /*step*/) + { + input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + ReduceStep(input, reduction_op, last_lane, Int2Type()); + } + + template + __device__ __forceinline__ void ReduceStep( + T& /*input*/, ///< [in] Calling thread's input item. 
+ ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the lane of the first and last thread in the logical warp + int first_thread = 0; + int last_thread = LOGICAL_WARP_THREADS - 1; + if (!IS_ARCH_WARP) + { + first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1)); + last_thread |= lane_id; + } + + // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32) + int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE; + + // Get the last valid lane + int last_lane = (ALL_LANES_VALID) ? 
+ last_thread : + CUB_MIN(last_thread, first_thread + lanes_with_valid_data); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask in the last lanes of each logical warp + warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 0000000..9ba8e94 --- /dev/null +++ 
b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// FlagT status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T 
reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? + 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 
<< STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int /*folded_items_per_warp*/, ///< [in] Total number of valid items folded into each logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? 
SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 0000000..f0deb8d --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,656 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane 
Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8, + }; + + template + struct IntegerTraits + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + /// Shared memory storage layout type + struct TempStorage {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + unsigned int lane_id; + + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ 
__forceinline__ WarpScanShfl( + TempStorage &/*temp_storage*/) + : + lane_id(LaneId()), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? + 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + //--------------------------------------------------------------------- + // Inclusive scan steps + //--------------------------------------------------------------------- + + /// Inclusive prefix scan step (specialized for summation across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+ ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InclusiveScanStep( + _T& input, ///< [in] Calling thread's input item. 
+ ScanOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + Int2Type /*step*/) ///< [in] Marker type indicating scan step + { + input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + InclusiveScanStep(input, scan_op, first_lane, Int2Type()); + } + + template + __device__ __forceinline__ void InclusiveScanStep( + _T& /*input*/, ///< [in] Calling thread's input item. + ScanOp /*scan_op*/, ///< [in] Binary scan operator + int /*first_lane*/, ///< [in] Index of first lane in segment + Int2Type /*step*/) ///< [in] Marker type indicating scan step + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + // Iterate scan steps + int segment_first_lane = 0; + + // Iterate scan steps +// InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>()); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output = InclusiveScanStep( + inclusive_output, + scan_op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + + } + + /// Inclusive scan, specialized for reduce-value-by-key + template + __device__ __forceinline__ void InclusiveScan( + KeyValuePair input, ///< [in] Calling thread's input item. + KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ReduceByKeyOp scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); + + unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + + // Mask away all lanes greater than ours + ballot = ballot & LaneMaskLe(); + + // Find index of first set bit + int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); + + // Iterate scan steps +// InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>()); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output.value = InclusiveScanStep( + inclusive_output.value, + scan_op.op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ { + InclusiveScan(input, inclusive_output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + + unsigned int segment_id = (IS_ARCH_WARP) ? 
+ lane_id : + lane_id % LOGICAL_WARP_THREADS; + + if (segment_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + Update(input, inclusive, exclusive, scan_op, is_integer); + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); + Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 0000000..c3a7a94 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,397 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) + typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + 
******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? + 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); + + WARP_SYNC(member_mask); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + WARP_SYNC(member_mask); + + ScanStep(partial, scan_op, Int2Type()); + } + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &/*partial*/, + ScanOp /*scan_op*/, + Int2Type /*step*/) + {} + + + /// Inclusive prefix scan 
(specialized for summation across primitive types) + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + T identity = 0; + ThreadStore(&temp_storage[lane_id], (CellT) identity); + + WARP_SYNC(member_mask); + + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore(temp_storage, (CellT) input); + } + + WARP_SYNC(member_mask); + + return (T)ThreadLoad(temp_storage); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] 
Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Retrieve aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, 
inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT /*scan_op*/, + IsIntegerT /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + } + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + cub::Sum /*scan_o*/, + Int2Type /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) 
ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + exclusive = inclusive - input; + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + // Broadcast warp aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + + // Update inclusive with initial value + inclusive = scan_op(initial_value, inclusive); + + // Get exclusive from exclusive + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); + + if (lane_id == 0) + exclusive = initial_value; + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh new file mode 100644 index 0000000..ef78dd6 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh @@ -0,0 +1,612 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). 
+ * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). 
+ * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpReduceShfl, + WarpReduceSmem >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. 
+ */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()) + {} + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); + } + + /** + * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. 
+ * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * + */ + template < + typename FlagT> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... 
+ * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename FlagT> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Return the warp-wide reductions to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). 
+ * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh new file mode 100644 index 0000000..3f78ca8 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh @@ -0,0 +1,936 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. 
+ * - Supports non-commutative scan operators + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, 3, ..., 31}. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// Whether the data type is an integer (which has fully-associative addition) + IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) + }; + + /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpScanShfl, + WarpScanSmem >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + unsigned int lane_id; + + + + /****************************************************************************** + * Public types + ******************************************************************************/ + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. 
+ * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InclusiveScan(input, inclusive_output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Compute exclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. 
Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. 
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. 
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
+ * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
+ * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + initial_value, + Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Combination (inclusive & exclusive) prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. 
+ * The corresponding output \p exclusive_partial in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
+ * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Data exchange + *********************************************************************/ + //@{ + + /** + * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the warp-wide broadcasts of values from + * lanes0 in each of four warps to all other threads in those warps. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Broadcast from lane0 in each warp to all other threads in the warp + * int warp_id = threadIdx.x / 32; + * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p thread_data will be + * {0, 0, ..., 0} in warp0, + * {32, 32, ..., 32} in warp1, + * {64, 64, ..., 64} in warp2, etc. + */ + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return InternalWarpScan(temp_storage).Broadcast(input, src_lane); + } + + //@} end member group + +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py new file mode 100644 index 0000000..fedab49 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py @@ -0,0 +1,31 @@ +# Compilation cmd: python setup.py install +# Alternative JIT compilation, refer to https://pytorch.org/tutorials/advanced/cpp_extension.html#jit-compiling-extensions +# from torch.utils.cpp_extension import load +# rdxtopk = load(name="rdxtopk", sources=["rdxtopk.cpp", "rdxtopk_cuda.cu"], verbose=True) + +import torch +import rdxtopk + + +size = 500000 +k = int(size * 0.01) +t = torch.randint(100000, [size], device='cuda').float() + torch.rand(size, dtype=torch.float32, device='cuda') + +indices = torch.arange(t.numel()).cuda().int() + +print("===== original tensor ======") +print(t) + +print("\n\n===== RadixTopK values not sorted ======") +out1, out2 = rdxtopk.topk(t, indices, k) +print(out1, out2) + +print("\n\n===== RadixTopK values sorted ======") +print(out1.sort(descending=True)[0], out2[out1.sort(descending=True)[1]]) + 
+print("\n\n===== Torch TopK values sorted ======") +out3, out4 = torch.topk(t, k) +print(out3, out4) + + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp new file mode 100644 index 0000000..a7c7097 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp @@ -0,0 +1,18 @@ +#include + +#include + +// CUDA forward declarations + +std::vector rdxtopk_cuda(torch::Tensor input,torch::Tensor indices, unsigned int k); + + +// C++ interface +std::vector topk(torch::Tensor input,torch::Tensor indices, unsigned int k) { + return rdxtopk_cuda(input,indices, k); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("topk", &topk, "Radix TopK Selection"); +} diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu new file mode 100644 index 0000000..409fc4f --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu @@ -0,0 +1,582 @@ +#include + +#include +#include +#include + +#include +#include + +#include "cub/device/device_radix_sort.cuh" +#include "cub/util_allocator.cuh" + +#include +#include +#include +#include +#include +#include +#include + +// #include "printFunctions.cuh" +// #include "generateProblems.cuh" +// #include "topk.h" + +using namespace std; +using namespace cub; +#define maxThreadsPerBlock 1024 + + +/** + * Computes the histogram over the digit values of an array of keys that MUST have a length of an integer multiple of (KPT * blockDim.x). + * The padding to the integer multiple can be done by adding 0's at the end and subtracting the number of padded 0's from the final result's 0 bin. + * The 2^NUM_BITS possible counts (0..2^NUM_BITSNUM_BITS-1) will be placed in global_histo. 
+ * @param keys [IN] The keys for which to compute the histogram + * @param digit [IN] + * @param global_histo [OUT] The array of element counts, MUST be 256 in size. + * @param per_block_histo [OUT] + */ + template< + typename KeyT, // Data type of the keys within device memory. Data will be twiddled (if necessary) to unsigned type + typename IndexT, // Data type used for key's offsets and counters (limits number of supported keys, uint = 2^32) + int NUM_BITS, // Number of bits being sorted at a time + int KPT, // Number of keys per thread + int TPB, // Number of threads per block + int PRE_SORT_RUNS_LENGTH // For values greater than 1, this causes to sort a thread's keys by runs of a given length to improve run-length encoded updates to shared memory. +> +__global__ void rdxsrt_histogram(KeyT *__restrict__ keys, const uint digit, IndexT *global_histo) +{ + /*** TYPEDEFs***/ + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + /*typedef LoadUnit KeyLoader;*/ + + /*** DECLARATIONS ***/ + UnsignedBits tloc_keys[KPT]; // local keys in a thread + uint tloc_masked[KPT]; + __shared__ uint shared_bins[0x01<(keys, tloc_keys); + #pragma unroll + for (int i=0; i(keys)[block_offset + threadIdx.x + blockDim.x * i]; + } + +#if true || USE_RLE_HISTO + // Mask + #pragma unroll + for (int i=0; i>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01<>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< +__global__ void rdxsrt_histogram_with_guards(KeyT *__restrict__ keys, const uint digit, IndexT *global_histo, const IndexT total_keys, const int block_index_offset) +{ + /*** TYPEDEFs***/ + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + /*typedef LoadUnit KeyLoader;*/ + + /*** DECLARATIONS ***/ + UnsignedBits tloc_keys[KPT]; + uint tloc_masked[KPT]; + __shared__ uint shared_bins[(0x01<(keys, tloc_keys, block_max_num_keys); + #pragma unroll + for (int i=0; i(keys)[block_offset + threadIdx.x + blockDim.x * i]; + } + } + + 
#pragma unroll + for(int i=0; i>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< highest digit, 3 => lowest digit for 32-bit) +* @param digit_val [IN] Digit value. +* @param num_items [IN] Number of entries. +* @param d_keys_buffer [OUT] Entries with x[digit] = digit_val. +* @param d_keys_out [OUT] Entries with x[digit] > digit_val. +* @param d_values_buffer [OUT] Entry values with x[digit] = digit_val. +* @param d_values_out [OUT] Entry values with x[digit] > digit_val. +* @param d_index_buffer [OUT] Index into d_keys_buffer. +* @param d_index_out [OUT] Index into d_keys_out. +*/ +template< + typename KeyT, // Data type of the keys within device memory. Data will be twiddled (if necessary) to unsigned type + typename IndexT, // Data type used for key's offsets and counters (limits number of supported keys, uint = 2^32) + int NUM_BITS, // Number of bits being sorted at a time + int KPT, // Number of keys per thread + int TPB // Number of threads per block +> +__global__ void select_kth_bucket(KeyT* d_keys_in, unsigned int* d_values_in, const uint digit, const uint digit_val, uint num_items, + KeyT* d_keys_buffer, KeyT* d_keys_out, unsigned int* d_values_buffer, unsigned int* d_values_out, uint* d_index_buffer, uint* d_index_out) +{ + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + // Specialize BlockLoad for a 1D block of TPB threads owning KPT integer items each + typedef cub::BlockLoad BlockLoadT; + + // Specialize BlockScan type for our thread block + typedef BlockScan BlockScanT; + + // in some sense, tile means block + const int tile_size = TPB * KPT; + int tile_idx = blockIdx.x; // Current tile index + int tile_offset = tile_idx * tile_size; + + // Allocate shared memory for BlockLoad + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load_items; + typename BlockScanT::TempStorage scan; + int offset[1]; + UnsignedBits raw_exchange[2 * TPB * KPT]; + } temp_storage; + + // Load a segment of consecutive 
items that are blocked across threads + UnsignedBits key_entries[KPT]; + unsigned int value_entries[KPT]; + + /*float payload_entries[KPT];*/ + int selection_flags[KPT]; + int selection_indices[KPT]; + + int num_tiles = (num_items + tile_size - 1) / tile_size; + int num_tile_items = tile_size; + bool is_last_tile = false; + if (tile_idx == num_tiles - 1) { + num_tile_items = num_items - tile_offset; + is_last_tile = true; + } + + // Load keys and values + if (is_last_tile) { + BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_keys_in) + tile_offset, key_entries, num_tile_items); + __syncthreads(); + BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_values_in) + tile_offset, value_entries, num_tile_items); + } + else { + BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_keys_in) + tile_offset, key_entries); + __syncthreads(); + BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_values_in) + tile_offset, value_entries); + } + + __syncthreads(); + + /*** Step 1: Find keys with digit value to selected digit value ***/ + #pragma unroll + for (int ITEM = 0; ITEM < KPT; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 0; + + if (!is_last_tile || (int(threadIdx.x * KPT) + ITEM < num_tile_items)) { + UnsignedBits key = KeyTraits::TwiddleIn(key_entries[ITEM]); + uint masked_key = (key>>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< digit_val); + } + } + + __syncthreads(); + + // Compute exclusive prefix sum + int num_selected; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_selected); + + __syncthreads(); + + if (num_selected > 0) { + int index_out; + if (threadIdx.x == 0) { + // Find index into keys_out array + index_out = atomicAdd(d_index_out, num_selected); + temp_storage.offset[0] = index_out; + } + + __syncthreads(); + + index_out = temp_storage.offset[0]; + + __syncthreads(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < KPT; 
++ITEM) + { + int local_scatter_offset = selection_indices[ITEM]; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange[local_scatter_offset] = key_entries[ITEM]; + temp_storage.raw_exchange[tile_size + local_scatter_offset] = value_entries[ITEM]; + /*temp_storage.raw_exchange[tile_size + local_scatter_offset] = payload_entries[ITEM];*/ + } + } + + __syncthreads(); + + // Write out matched entries to output array + for (int item = threadIdx.x; item < num_selected; item += TPB) + { + reinterpret_cast(d_keys_out)[index_out + item] = temp_storage.raw_exchange[item]; + d_values_out[index_out + item] = temp_storage.raw_exchange[tile_size + item]; + } + + __syncthreads(); + +#if 0 + for (int item = threadIdx.x; item < num_selected; item += TPB) + { + payload_out[num_selections_prefix + item] = temp_storage.raw_exchange[tile_size + item]; + } +#endif + } + + /*** Step 2: Find entries that have digit equal to digit value ***/ + #pragma unroll + for (int ITEM = 0; ITEM < KPT; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 0; + + if (!is_last_tile || (int(threadIdx.x * KPT) + ITEM < num_tile_items)) { + UnsignedBits key = KeyTraits::TwiddleIn(key_entries[ITEM]); + uint masked_key = (key>>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< 0) { + int index_buffer; + if (threadIdx.x == 0) { + index_buffer = atomicAdd(d_index_buffer, num_selected); + temp_storage.offset[0] = index_buffer; + } + + __syncthreads(); + + index_buffer = temp_storage.offset[0]; + + __syncthreads(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < KPT; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM]; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange[local_scatter_offset] = key_entries[ITEM]; + temp_storage.raw_exchange[tile_size + local_scatter_offset] = value_entries[ITEM]; + /*temp_storage.raw_exchange[tile_size + local_scatter_offset] = payload_entries[ITEM];*/ + } + } + + __syncthreads(); + + // Write 
out output entries + for (int item = threadIdx.x; item < num_selected; item += TPB) + { + reinterpret_cast(d_keys_buffer)[index_buffer + item] = temp_storage.raw_exchange[item]; + d_values_buffer[index_buffer + item] = temp_storage.raw_exchange[tile_size + item]; + } + + __syncthreads(); + } +} + +__global__ void set_index_array(unsigned int* array, unsigned int len) { + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + + while (i < len) { + array[i] = i; + i += gridSize; + } +} + + +#define KPT 16 +#define TPB 384 +#define DIGIT_BITS 8 +cudaError_t CUDARadixSelectTopK(torch::Tensor d_keys_in, + torch::Tensor d_indices_in, + unsigned int num_items, + unsigned int k, + float *d_keys_out, + unsigned int *d_values_out) { + + cudaError error = cudaSuccess; + + // get helper buffers + // unsigned int *d_histogram = buf->histogram; + // unsigned int *d_index_out = buf->index_out; + // unsigned int *d_index_buffer = buf->index_buffer; + + // float* keys_double_buffer[2] = {buf->keys_buffer0, buf->keys_buffer1}; + // unsigned int* values_double_buffer[2] = {buf->value_buffer0, buf->value_buffer1}; + unsigned char current_keys_buffer = 0; + + //initialize buffer with empty tensor + //unsigned int *d_histogram = (uint*)torch::zeros(256*128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); + //unsigned int *d_index_out = (uint*)torch::zeros(128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); + //unsigned int *d_index_buffer = (uint*)torch::zeros(128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); + unsigned int *d_histogram, *d_index_out, *d_index_buffer; + cudaMalloc(&d_histogram, 256*128); + cudaMalloc(&d_index_out, 128); + cudaMalloc(&d_index_buffer, 128); + + torch::Tensor keys_double_tensor[2] = {d_keys_in.clone(), d_keys_in.clone()}; + torch::Tensor indices_double_tensor[2] = {d_indices_in.clone(), 
d_indices_in.clone()}; + float* keys_double_buffer[2] = {(float*)keys_double_tensor[0].data_ptr(), (float*)keys_double_tensor[1].data_ptr()}; + unsigned int* values_double_buffer[2] = {(unsigned int*)indices_double_tensor[0].data_ptr(), (unsigned int*)indices_double_tensor[1].data_ptr()}; + //float* keys_double_buffer[2] = {(float*)d_keys_in.clone().data_ptr(), + // (float*)d_keys_in.clone().data_ptr()}; + //unsigned int* values_double_buffer[2] = {(uint*)d_indices_in.clone().data_ptr(), + // (uint*)d_indices_in.clone().data_ptr()}; + + // Set the index into output array to 0. + cudaMemset(d_index_out, 0, 4); + + unsigned int KPB = KPT * TPB; + + unsigned int *h_histogram = new unsigned int[256]; + + // set value array (index) + // int blocksPerGrid = (int) ceil(1.0 * num_items / TPB); + // set_index_array<<>>(values_double_buffer[current_keys_buffer], num_items); + + + // enumerate each digit (32-bit data (float32) / 8-bit/pass, so that's 4 digit in total) + for (unsigned int digit = 0; digit < 4; digit++) { + unsigned int num_blocks = num_items / KPB;// Pass-0 rough processing blocks (floor on purpose) + unsigned int processed_elements = num_blocks * KPB;// Pass-0 number of rough processed elements + unsigned int remaining_elements = num_items - processed_elements;// Do the remaining elements with a check in the inner loop + unsigned int remainder_blocks = (KPB - 1 + remaining_elements) / KPB;// Number of blocks required for remaining elements (typically 0 or 1) + + /******************************************************************************************/ + /* Caluclate Histogram */ + /******************************************************************************************/ + // Zero out the histogram + cudaMemset(d_histogram, 0, 256 * sizeof(int)); + + float* d_current_keys_in = keys_double_buffer[current_keys_buffer]; + unsigned int* d_current_value_in = values_double_buffer[current_keys_buffer]; + if (num_blocks > 0) + 
rdxsrt_histogram<<>>(d_current_keys_in, digit, d_histogram); + if (remaining_elements > 0) + rdxsrt_histogram_with_guards<<>>(d_current_keys_in, digit, d_histogram, num_items, num_blocks); + + /******************************************************************************************/ + /* Find the bin which contains the Kth largest element */ + /******************************************************************************************/ + cudaMemcpy(h_histogram, d_histogram, 256 * sizeof(uint), cudaMemcpyDeviceToHost); + + // currently we find the bin on host, hence we need to synchronize the stream + // cudaStreamSynchronize(stream); + + unsigned int rolling_sum = 0; + unsigned int digit_val; + for (int i = 255; i >= 0; i--) { + if ((rolling_sum + h_histogram[i]) > k) { + digit_val = i; + k -= rolling_sum; + break; + } + rolling_sum += h_histogram[i]; + } + + cudaMemset(d_index_buffer, 0, 4); + select_kth_bucket<<>>(d_current_keys_in, + d_current_value_in, + digit, + digit_val, + num_items, + keys_double_buffer[1-current_keys_buffer], + d_keys_out, + values_double_buffer[1-current_keys_buffer], + d_values_out, + d_index_buffer, + d_index_out); + uint h_index_out; + uint h_index_buffer; + + cudaMemcpy(&h_index_out, d_index_out, sizeof(uint), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_index_buffer, d_index_buffer, sizeof(uint), cudaMemcpyDeviceToHost); + + // cudaStreamSynchronize(stream); + // Update number of items to reflect reduced number of elements. + num_items = h_index_buffer; + + if (k == 0) break; + else if (k != 0 && digit == 3) { + // We are at last digit and k != 4 implies that kth value has repetition. + // Copy any of the repeated values(and keys!) to out array to complete the array. 
+ cudaMemcpy(d_keys_out + h_index_out, keys_double_buffer[1-current_keys_buffer] ,k * sizeof(float), cudaMemcpyDeviceToDevice); + cudaMemcpy(d_values_out + h_index_out, values_double_buffer[1-current_keys_buffer], k * sizeof(float), cudaMemcpyDeviceToDevice); + k -= k; + } + + current_keys_buffer = 1 - current_keys_buffer; + } + delete[] h_histogram; + cudaFree(d_histogram); + cudaFree(d_index_out); + cudaFree(d_index_buffer); +} + + +// __global__ void _Uint32ToInt32(int *dst_data, +// unsigned int *src_data, +// unsigned int n) +// { +// // set thread ID +// unsigned int tid = threadIdx.x; +// unsigned int gridSize = blockDim.x * gridDim.x; +// unsigned int i = blockIdx.x * blockDim.x + tid; +// unsigned int blockSize = blockDim.x; + +// while (i < n) { +// dst_data[i] = (int)src_data[i]; +// i += gridSize; +// } +// } + +// void Uint32ToInt32(int *dst_data, +// unsigned int *src_data, +// unsigned int num_elements) +// { +// int blocksPerGrid = (int) ceil(1.0 * num_elements / maxThreadsPerBlock); +// _Uint32ToInt32<<>>(dst_data, src_data, num_elements); +// } + + + +std::vector rdxtopk_cuda( + torch::Tensor input,torch::Tensor indices, unsigned int k) { + + unsigned int num_items = input.numel(); + + auto d_keys_out = torch::zeros(k, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); + auto d_values_out = torch::zeros(k, torch::TensorOptions().dtype(torch::kInt).device(input.device())); + + CUDARadixSelectTopK(input,indices, + num_items, + k, + (float*)d_keys_out.data_ptr(), + (uint*)d_values_out.data_ptr()); + + // Uint32ToInt32((int*)d_values_out.data_ptr(), (uint*)d_values_out.data_ptr(), k); + +return {d_keys_out, d_values_out}; +} + + diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py new file mode 100644 index 0000000..49de7d7 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py @@ -0,0 +1,14 @@ +from setuptools 
import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='rdxtopk', + ext_modules=[ + CUDAExtension('rdxtopk', [ + 'rdxtopk.cpp', + 'rdxtopk_cuda.cu', + ]), + ], + cmdclass={ + 'build_ext': BuildExtension + }) diff --git a/untitled folder/grace_dl/dist/compressor/randomk.py b/untitled folder/grace_dl/dist/compressor/randomk.py new file mode 100644 index 0000000..c620945 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/randomk.py @@ -0,0 +1,41 @@ +import torch + +from grace_dl.dist import Compressor + + +def sparsify(tensor, compress_ratio): + tensor = tensor.flatten() + numel = tensor.numel() + k = max(1, int(numel * compress_ratio)) + # indices = torch.randperm(numel, device=tensor.device)[:k] + indices = torch.randint(numel, [k], device=tensor.device) + values = tensor[indices] + return indices, values + + +class RandomKCompressor(Compressor): + """Python libraries Based Compress by performing sparsification (i.e., sending a ratio of the actual tensor size.""" + + def __init__(self, compress_ratio): + super().__init__() + self.global_step = 0 + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + """Use Python Random libraries RNG to compress by generating a list of indices to be transmitted.""" + + h = sum(bytes(name, encoding='utf8'), self.global_step) + self.global_step += 1 + torch.manual_seed(h) + indices, values = sparsify(tensor, self.compress_ratio) + + ctx = indices, tensor.numel(), tensor.size() + return [values], ctx + + def decompress(self, tensors, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + indices, numel, shape = ctx + values, = tensors + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/signsgd.py b/untitled 
folder/grace_dl/dist/compressor/signsgd.py new file mode 100644 index 0000000..e2e75eb --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/signsgd.py @@ -0,0 +1,30 @@ +import torch + +from grace_dl.dist import Compressor + + +class SignSGDCompressor(Compressor): + + def __init__(self): + super().__init__(average=False) + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + + tensor_compressed = tensor >= 0 + return [tensor_compressed.type(torch.uint8)], shape + + def decompress(self, tensors, shape): + """Decoding the signs to float format """ + sign_encode, = tensors + sign_decode = sign_encode.type(torch.float32) * 2 - 1 + tensor_decompressed = sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = sum(tensors) + sign = agged_tensor >= 0 + agged_tensor = sign * 2.0 - 1.0 + return agged_tensor diff --git a/untitled folder/grace_dl/dist/compressor/signum.py b/untitled folder/grace_dl/dist/compressor/signum.py new file mode 100644 index 0000000..2b70bd5 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/signum.py @@ -0,0 +1,37 @@ +import torch + +from grace_dl.dist import Compressor + + +class SignumCompressor(Compressor): + + def __init__(self, momentum): + super().__init__(average=False) + self.momentum = momentum + self.momentums = {} + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + shape = tensor.size() + tensor = tensor.flatten() + + # update tensor by momentum + if name in self.momentums: + tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] + self.momentums[name] = tensor + tensor_compressed = tensor >= 0 + return [tensor_compressed.type(torch.uint8)], shape + + def decompress(self, tensors, shape): + sign_encode, = tensors + """Decoding the signs to float format """ + sign_decode = sign_encode.type(torch.float32) * 2 - 1 + tensor_decompressed = 
sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = sum(tensors) + agged_tensor = agged_tensor >= 0 + agged_tensor = agged_tensor * 2.0 - 1.0 + return agged_tensor diff --git a/untitled folder/grace_dl/dist/compressor/terngrad.py b/untitled folder/grace_dl/dist/compressor/terngrad.py new file mode 100644 index 0000000..c5267fc --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/terngrad.py @@ -0,0 +1,32 @@ +import torch +from grace_dl.dist import Compressor + + +class TernGradCompressor(Compressor): + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + + std = (tensor - torch.mean(tensor)) ** 2 + std = torch.sqrt(torch.mean(std)) + c = 2.5 * std.item() + gradient = torch.clamp(tensor, -c, c) + abs_gradient = gradient.abs() + scalar = abs_gradient.max() + + sign_gradient = gradient.sign() * scalar + rnd_sample = torch.empty_like(tensor).uniform_(0, 1) * scalar.item() + sign_gradient[rnd_sample >= abs_gradient] = 0 + new_sign = sign_gradient.sign() # -1, 0, 1 + + tensor_compressed = new_sign.char(), scalar.flatten() + return tensor_compressed, shape + + def decompress(self, tensor_compressed, ctx): + sign, scalar = tensor_compressed + shape = ctx + tensor_decompressed = sign.float() * scalar + return tensor_decompressed.view(shape) + + diff --git a/untitled folder/grace_dl/dist/compressor/threshold.py b/untitled folder/grace_dl/dist/compressor/threshold.py new file mode 100644 index 0000000..b0ebac8 --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/threshold.py @@ -0,0 +1,27 @@ +import torch + +from grace_dl.dist import Compressor + + +class ThresholdCompressor(Compressor): + + def __init__(self, threshold): + super().__init__(tensors_size_are_same=False) + self.threshold = threshold + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + + indices, = torch.where(tensor.abs() >= 
min(self.threshold, torch.max(tensor))) + values = tensor[indices] + ctx = shape + return [values, indices.int()], ctx + + def decompress(self, tensor_compressed, ctx): + shape = ctx + numel = shape.numel() + values, indices = tensor_compressed + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices.long(), values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/topk.py b/untitled folder/grace_dl/dist/compressor/topk.py new file mode 100644 index 0000000..6fd55fe --- /dev/null +++ b/untitled folder/grace_dl/dist/compressor/topk.py @@ -0,0 +1,69 @@ +import torch +from torch.utils.dlpack import to_dlpack + +from grace_dl.dist import Compressor + + +def rdxtopk_cuda(tensor, k): + import rdxtopk + # Only applicable to cuda tensors. + device = tensor.device + tensor = tensor.cuda() + indices = torch.arange(tensor.numel(), device=tensor.device).int() + vals, indices = rdxtopk.topk(tensor, indices, k) + return vals.to(device), indices.to(device).long() + + +def cupy_topk(tensor, k): + import cupy # conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 + # Only applicable to cuda tensors. cupy.partition is way much faster than torch.topk for large cuda tensors. + # Warning!! This is not an exact topk implementation, when there are multiple elements equal to the threshold. + # For float32 gradients, this rarely happens. 
+ device = tensor.device + tensor = tensor.cuda() + cupy_tensor = cupy.fromDlpack(to_dlpack(tensor)) + threshold = -cupy.partition(-cupy_tensor, k)[k] + indices, = torch.where(tensor >= threshold.item()) + indices = indices[:k] + values = tensor[indices] + return values.to(device), indices.to(device).long() + + +def sparsify(tensor, compress_ratio, kernel): + tensor = tensor.flatten() + k = max(1, int(tensor.numel() * compress_ratio)) + if kernel == 'torch': + _, indices = torch.topk(tensor.abs(), k, sorted=False,) + elif kernel =='cupy': + _, indices = cupy_topk(tensor.abs(), k) + elif kernel == 'rdxtopk_cuda': + _, indices = rdxtopk_cuda(tensor.abs(), k) + values = torch.gather(tensor, 0, indices) + return values, indices.int() + + +def desparsify(tensors, numel): + values, indices = tensors + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices.long(), values) + return tensor_decompressed + + +class TopKCompressor(Compressor): + + def __init__(self, compress_ratio, kernel='torch'): + super().__init__() + self.compress_ratio = compress_ratio + self.kernel = kernel + + def compress(self, tensor, name): + tensors = sparsify(tensor, self.compress_ratio, self.kernel) + ctx = tensor.size() + return tensors, ctx + + def decompress(self, tensors, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + shape = ctx + numel = shape.numel() + tensor_decompressed = desparsify(tensors, numel) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/helper.py b/untitled folder/grace_dl/dist/helper.py new file mode 100644 index 0000000..e70b980 --- /dev/null +++ b/untitled folder/grace_dl/dist/helper.py @@ -0,0 +1,102 @@ +def grace_from_params(params): + comp = params.get('compressor', 'none') + mem = params.get('memory', 'none') + comm = params.get('communicator', 'allreduce') + if comp == 'dgc': + from 
grace_dl.dist.compressor.dgc import DgcCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = DgcCompressor(compress_ratio) + elif comp == 'efsignsgd': + from grace_dl.dist.compressor.efsignsgd import EFSignSGDCompressor + lr = params.get('lr', 0.1) + compressor = EFSignSGDCompressor(lr) + elif comp == 'fp16': + from grace_dl.dist.compressor.fp16 import FP16Compressor + compressor = FP16Compressor() + elif comp == 'natural': + from grace_dl.dist.compressor.natural import NaturalCompressor + compressor = NaturalCompressor() + elif comp == 'natural_cuda': + from grace_dl.dist.compressor.natural import NaturalCompressor_CUDA + compressor = NaturalCompressor_CUDA() + elif comp == 'none': + from grace_dl.dist.compressor.none import NoneCompressor + compressor = NoneCompressor() + elif comp == 'onebit': + from grace_dl.dist.compressor.onebit import OneBitCompressor + compressor = OneBitCompressor() + elif comp == 'powersgd': + from grace_dl.dist.compressor.powersgd import PowerSGDCompressor + compressor = PowerSGDCompressor() + elif comp == 'qsgd': + from grace_dl.dist.compressor.qsgd import QSGDCompressor + quantum_num = params.get('quantum_num', 127) + bucket_size = params.get('bucket_size', 128) + compressor = QSGDCompressor(quantum_num, bucket_size) + elif comp == 'qsgd_cuda': + from grace_dl.dist.compressor.qsgd import QSGDCompressor_CUDA + quantum_num = params.get('quantum_num', 127) + bucket_size = params.get('bucket_size', 128) + compressor = QSGDCompressor_CUDA(quantum_num, bucket_size) + elif comp == 'randomk': + from grace_dl.dist.compressor.randomk import RandomKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = RandomKCompressor(compress_ratio) + elif comp == 'signsgd': + from grace_dl.dist.compressor.signsgd import SignSGDCompressor + compressor = SignSGDCompressor() + elif comp == 'signum': + from grace_dl.dist.compressor.signum import SignumCompressor + momentum = params.get('momentum', 0.9) + compressor 
= SignumCompressor(momentum) + elif comp == 'terngrad': + from grace_dl.dist.compressor.terngrad import TernGradCompressor + compressor = TernGradCompressor() + elif comp == 'threshold': + from grace_dl.dist.compressor.threshold import ThresholdCompressor + threshold = params.get('threshold', 0.01) + compressor = ThresholdCompressor(threshold) + elif comp == 'topk': + from grace_dl.dist.compressor.topk import TopKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + kernel = params.get('kernel', 'torch') + compressor = TopKCompressor(compress_ratio, kernel) + else: + raise NotImplementedError(comp) + + if mem == 'dgc': + from grace_dl.dist.memory.dgc import DgcMemory + momentum = params.get('momentum', 0.9) + gradient_clipping = params.get('gradient_clipping', False) + memory = DgcMemory(momentum, gradient_clipping, params['world_size']) + elif mem == 'none': + from grace_dl.dist.memory.none import NoneMemory + memory = NoneMemory() + elif mem == 'powersgd': + from grace_dl.dist.memory.powersgd import PowerSGDMemory + compress_rank = params.get('compress_rank', 1) + memory = PowerSGDMemory(compressor.q_memory, compress_rank) + elif mem == 'residual': + from grace_dl.dist.memory.residual import ResidualMemory + memory = ResidualMemory() + elif mem == 'efsignsgd': + from grace_dl.dist.memory.efsignsgd import EFSignSGDMemory + lr = params.get('lr', 0.1) + memory = EFSignSGDMemory(lr) + else: + raise NotImplementedError(mem) + + if comm == 'allreduce': + from grace_dl.dist.communicator.allreduce import Allreduce + return Allreduce(compressor, memory, params['world_size']) + elif comm == 'allgather': + from grace_dl.dist.communicator.allgather import Allgather + return Allgather(compressor, memory, params['world_size']) + elif comm == 'broadcast': + from grace_dl.dist.communicator.broadcast import Broadcast + return Broadcast(compressor, memory, params['world_size']) + elif comm == 'alltoall': + from grace_dl.dist.communicator.all_to_all import AllToAll + 
return AllToAll(compressor, memory, params['world_size']) + else: + raise NotImplementedError(comm) diff --git a/untitled folder/grace_dl/dist/memory/dgc.py b/untitled folder/grace_dl/dist/memory/dgc.py new file mode 100644 index 0000000..93cd6e6 --- /dev/null +++ b/untitled folder/grace_dl/dist/memory/dgc.py @@ -0,0 +1,39 @@ +import torch +from torch import distributed as dist + +from grace_dl.dist import Memory + + +class DgcMemory(Memory): + def __init__(self, momentum, gradient_clipping, world_size): + self.gradient_clipping = gradient_clipping + self.momentum = momentum + self.world_size = world_size + self.gradients = {} + self.residuals = {} + + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + if self.gradient_clipping: + tensor_squ_sum = torch.sum(tensor * tensor) + clipping_val = torch.sqrt(dist.all_reduce(tensor_squ_sum) / self.world_size) + tensor = tensor.clamp(-clipping_val, clipping_val) + if name in self.residuals: + self.residuals[name] = self.momentum * self.residuals[name] + tensor + else: + self.residuals[name] = tensor + if name in self.gradients: + self.gradients[name] += self.residuals[name] + tensor = self.gradients[name] + else: + self.gradients[name] = tensor + return tensor + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + shape, mask, _ = ctx + not_mask = ~mask.view(shape) + temp = self.residuals[name] * not_mask + self.residuals[name] = temp + temp = self.gradients[name] * not_mask + self.gradients[name] = temp diff --git a/untitled folder/grace_dl/dist/memory/efsignsgd.py b/untitled folder/grace_dl/dist/memory/efsignsgd.py new file mode 100644 index 0000000..c681f0e --- /dev/null +++ b/untitled folder/grace_dl/dist/memory/efsignsgd.py @@ -0,0 +1,19 @@ +from grace_dl.torch import Memory + + +class EFSignSGDMemory(Memory): + def __init__(self, lr): + self.residuals = {} + self.learning_rate = lr + + def compensate(self, tensor, name): + """Update the 
tensor with the residuals.""" + if name in self.residuals: + tensor = self.residuals[name] + self.learning_rate * tensor + return tensor + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + tensor_decompressed = compressor.decompress(tensor_compressed, ctx) + residual = tensor - tensor_decompressed + self.residuals[name] = residual diff --git a/untitled folder/grace_dl/dist/memory/none.py b/untitled folder/grace_dl/dist/memory/none.py new file mode 100644 index 0000000..d603e99 --- /dev/null +++ b/untitled folder/grace_dl/dist/memory/none.py @@ -0,0 +1,11 @@ +from grace_dl.dist import Memory + + +class NoneMemory(Memory): + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + return tensor + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + pass diff --git a/untitled folder/grace_dl/dist/memory/powersgd.py b/untitled folder/grace_dl/dist/memory/powersgd.py new file mode 100644 index 0000000..96bde4e --- /dev/null +++ b/untitled folder/grace_dl/dist/memory/powersgd.py @@ -0,0 +1,37 @@ +import torch + +from grace_dl.dist import Memory + + +class PowerSGDMemory(Memory): + def __init__(self, q_memory, compress_rank=1): + self.compress_rank = compress_rank + self.q_memory = q_memory + self.residuals = {} + + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + if tensor.dim() == 1: + return tensor + + if name in self.q_memory: + tensor += self.residuals[name] + + shape = tensor.size() + n = shape[0] + m = 1 + for dim in shape[1:]: + m = m * dim + + r = min(n, m, self.compress_rank) + normal = torch.empty(m, r, dtype=tensor.dtype, layout=tensor.layout, device=tensor.device).normal_() + self.q_memory[name] = normal + + return tensor + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + if ctx is None: + return + + self.residuals[name] = tensor - 
compressor.decompress(tensor_compressed, ctx) diff --git a/untitled folder/grace_dl/dist/memory/residual.py b/untitled folder/grace_dl/dist/memory/residual.py new file mode 100644 index 0000000..e00a161 --- /dev/null +++ b/untitled folder/grace_dl/dist/memory/residual.py @@ -0,0 +1,20 @@ +from grace_dl.dist import Memory + + +class ResidualMemory(Memory): + def __init__(self, beta=1.0, gamma=1.0): + self.residuals = {} + self.beta = beta + self.gamma = gamma + + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + if name in self.residuals: + tensor = self.beta * self.residuals[name] + self.gamma * tensor + return tensor + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + tensor_decompressed = compressor.decompress(tensor_compressed, ctx) + residual = tensor - tensor_decompressed + self.residuals[name] = residual diff --git a/untitled folder/grace_dl/tensorflow/.DS_Store b/untitled folder/grace_dl/tensorflow/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..354ac1b19a8cd736f5444b51fb428fc7b68e8ed2 GIT binary patch literal 8196 zcmeHM%Wl&^6ur}i#%U?aqLtbeme@ujp+Oa5W74ok7{LNiuw$#WusxRTG^(OV-qYXU z2M`;!EcgWefju9I?=FLYxb z;5jb3ynEw?4xJ9@ByT8dIvh&IG&psC z-P`~1Y_-30CI9Fozm_grQti4=%{cUFc#J-boK2~61Tnaf;BkW#uH0N+-Aad|vR^jq zSjwx~{Jc=La?F5UD*89yf|`VX!79frDY=1Hs*4r%gR`4*jTSbQSFT_vDz@_-6)V{D zO8x%2P^#o}sZgCyK^b^~(-h|^DZ=ckCiN$_P|dAl;1AJaWw!tT literal 0 HcmV?d00001 diff --git a/untitled folder/grace_dl/tensorflow/__init__.py b/untitled folder/grace_dl/tensorflow/__init__.py new file mode 100644 index 0000000..5101fb8 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/__init__.py @@ -0,0 +1,114 @@ +from abc import ABC, abstractmethod + +import tensorflow as tf + + +class Memory(ABC): + """ + Interface for implementing a memory operator + """ + @staticmethod + def init_var(initial_value): + from collections import defaultdict + residuals_init = defaultdict(list) + 
for var in tf.trainable_variables(): + shape_str = ''.join(str(e) for e in var.get_shape().as_list()) + residuals_init[shape_str].append(tf.Variable(tf.zeros_like(var), trainable=False)) + + @abstractmethod + def compensate(self, tensor, name): + """ + Compensate the tensor with the residuals. + + :param name: a unique id for this tensor + :param tensor: the current state of the tensor + :return: tensor compensated with stored values + """ + raise NotImplemented("compensate was not implemented.") + + @abstractmethod + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """ + Update the residuals. + + :param tensor: the original tensor + :param name: a unique id for this tensor + :param compressor: the compressor used to compress `tensor` + :param tensor_compressed: a collection of tensors result of `compressor.compress` + :param ctx: the context of compression + :return: a collection of memory operations to be executed + """ + raise NotImplemented("update was not implemented.") + + +class Compressor(ABC): + """ + Interface for compressing and decompressing a given tensor. + """ + + def __init__(self, average=True, tensors_size_are_same=True): + self.average = average + self.tensors_size_are_same = tensors_size_are_same + + @abstractmethod + def compress(self, tensor, name): + """ + Compresses a tensor and returns a collection of compressed tensors with the context needed to decompress it. + + :param tensor: a tensor to compress + :return: a collection of tensors and the context to decompress them + """ + raise NotImplemented("compress was not implemented.") + + @abstractmethod + def decompress(self, tensors, ctx): + """ + Decompress a collection of compressed tensors with the given context. + + :param tensors: a collection of tensors + :param ctx: context of the compression + """ + raise NotImplemented("decompress was not implemented.") + + def aggregate(self, tensors): + """ + Aggregate a collection of tensors. 
+ + :param tensors: collection of tensors + :return: aggregated tensor + """ + return tf.math.add_n(tensors) + + +class Communicator(ABC): + @abstractmethod + def send_receive(self, tensors, ctx): + """ + This method should use the collective communication, aggregation, and decompression methods. + + :param tensors: list of tensors to be sent + :param ctx: compression context of the tensors + :return: a decompressed tensor, result of aggregating the tensors of each worker + """ + raise NotImplemented("send was not implemented.") + + def __init__(self, compressor, memory): + """ + :param compressor: provides compression and decompression operators + :param memory: provides compensation and update functions + """ + self.compressor = compressor + self.memory = memory + + def step(self, tensor, name): + """ + Compensate, compress, update, communicate, decompress, and aggregate tensors of all the workers + + :param tensor: the tensor to be communicated across workers + :return: the tensor after + """ + tensor = self.memory.compensate(tensor, name) + tensors_compressed, ctx = self.compressor.compress(tensor, name) + update_ops = self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) + with tf.control_dependencies(update_ops): + return self.send_receive(tensors_compressed, ctx) diff --git a/untitled folder/grace_dl/tensorflow/communicator/.DS_Store b/untitled folder/grace_dl/tensorflow/communicator/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 1.25 * tf.math.ceil(elemnum * compress_ratio), lambda: 1.25 * thr, + lambda: 0.9 * thr) + mask = 
tf.math.greater_equal(tf.math.abs(tensor_masked), thr) + selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) + return thr, mask, selected + + def condition(thr, mask, selected): + cond_a = selected > 1.25 * tf.math.ceil(elemnum * compress_ratio) + cond_b = selected < 0.8 * tf.math.ceil(elemnum * compress_ratio) + return tf.math.logical_or(cond_a, cond_b) + + thr2, mask2, selected2 = tf.while_loop(condition, body, (thr, mask, selected), maximum_iterations=20) + thr2 = tf.cond(selected2 < 1, lambda: 0.8 * thr2, lambda: thr2) + mask2 = tf.math.greater(tf.math.abs(tensor_masked), thr2) + + indices = tf.reshape(tf.where(mask2), [-1]) + tensor_value2 = tf.boolean_mask(tensor_masked, mask2) + mean = tf.reshape(tf.math.reduce_mean(tensor_value2), [-1]) + + return indices, mean, mask2 + + tensor_shape = tf.shape(tensor) + tensor_size = tf.size(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + plus_mask = tf.math.greater(tensor_flatten, 0) + minus_mask = tf.math.less(tensor_flatten, 0) + plus_indices, plus_mean, plus_mask = quan(tensor_flatten, plus_mask, self.compress_ratio) + minus_indices, minus_mean, minus_mask = quan(tensor_flatten, minus_mask, self.compress_ratio) + + plus_mean = tf.bitcast(plus_mean, tf.int32) + plus_indices = tf.reshape(tf.cast(plus_indices, dtype=tf.int32), [-1]) + minus_mean = tf.bitcast(minus_mean, tf.int32) + minus_indices = tf.reshape(tf.cast(minus_indices, dtype=tf.int32), [-1]) + plus_indices_size = tf.reshape(tf.size(plus_indices), [-1]) + tensor_compressed = tf.concat([plus_mean, minus_mean, plus_indices_size, plus_indices, minus_indices], 0) + ctx = tensor_shape, tensor_size + return [tensor_compressed], ctx + + def decompress(self, tensors_compressed, ctx): + tensor_compressed, = tensors_compressed + plus_mean = tensor_compressed[0] + minus_mean = tensor_compressed[1] + plus_indices_size = tensor_compressed[2] + plus_indices = tensor_compressed[3:3 + plus_indices_size] + minus_indices = tensor_compressed[3 + 
plus_indices_size:] + + plus_mean = tf.bitcast(plus_mean, tf.float32) + minus_mean = tf.bitcast(minus_mean, tf.float32) + tensor_shape, tensor_size = ctx + + plus_mean = tf.ones(tf.shape(plus_indices), dtype=tf.float32) * plus_mean + minus_mean = tf.ones(tf.shape(minus_indices), dtype=tf.float32) * minus_mean + indices = tf.expand_dims(plus_indices, 1) + plus_means = tf.scatter_nd(indices, plus_mean, [tensor_size]) + indices = tf.expand_dims(minus_indices, 1) + tensor_decompressed = tf.tensor_scatter_nd_update(plus_means, indices, minus_mean) + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/dgc.py b/untitled folder/grace_dl/tensorflow/compressor/dgc.py new file mode 100644 index 0000000..af81765 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/dgc.py @@ -0,0 +1,68 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class DgcCompressor(Compressor): + """ + (2017). Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training. 
+ Retrieved from http://arxiv.org/abs/1712.01887 + """ + def __init__(self, compress_ratio): + super().__init__(tensors_size_are_same=False) + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + elemnum = tensor_flatten.get_shape().as_list()[0] + + sample_shape = tf.reshape(tf.convert_to_tensor(max(1, int(elemnum * 0.01)), dtype=tf.int32), [-1]) + sample_index = tf.random.uniform(sample_shape, minval=0, maxval=elemnum, dtype=tf.int32) + sample_tensor = tf.gather(tensor_flatten, sample_index) + + k = max(1, int(elemnum * self.compress_ratio * 0.01)) + vals, indices = tf.math.top_k(tf.math.abs(sample_tensor), k) + thr = tf.math.reduce_min(vals) + mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) + + selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) + + def body(thr, mask, selected): + thr = tf.cond(selected > 1.25 * max(1, int(elemnum * self.compress_ratio)), lambda: 1.25 * thr, + lambda: 0.9 * thr) + mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) + selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) + return thr, mask, selected + + def condition(thr, mask, selected): + cond_a = selected > 1.25 * max(1, int(elemnum * self.compress_ratio)) + cond_b = selected < 0.8 * max(1, int(elemnum * self.compress_ratio)) + return tf.math.logical_or(cond_a, cond_b) + + thr, mask, new_selected = tf.while_loop(condition, body, (thr, mask, selected), maximum_iterations=20) + + thr = tf.cond(new_selected < 1, lambda: 0.8 * thr, lambda: thr) + # mask = tf.math.greater_equal(tf.math.abs(tensor_flatten), thr) + mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) # fix the dgc NCF data volume issue + + indices = tf.reshape(tf.where(mask), [-1]) + values = tf.gather(tensor_flatten, indices) + + values = tf.bitcast(values, tf.int32) + indices = tf.cast(indices, dtype=tf.int32) + tensor_compressed = tf.concat([values, indices], 0) + ctx = 
tensor_shape, mask + + return [tensor_compressed], ctx + + def decompress(self, tensors_compressed, ctx): + tensor_compressed, = tensors_compressed + tensor_shape, _ = ctx + values, indices = tf.split(tensor_compressed, 2) + values = tf.bitcast(values, tf.float32) + tensor_size = tf.math.reduce_prod(tensor_shape) + indices = tf.expand_dims(indices, 1) + tensor_decompressed = tf.scatter_nd(indices, values, [tensor_size]) + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py b/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py new file mode 100644 index 0000000..f0b0ec2 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py @@ -0,0 +1,40 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class EFSignSGDCompressor(Compressor): + """ + (2019). Error Feedback Fixes SignSGD and other Gradient Compression Schemes. + Retrieved from http://arxiv.org/abs/1901.09847 + """ + def __init__(self, lr): + super().__init__() + self.learning_rate = lr + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = tf.math.add_n(tensors) + agged_tensor = agged_tensor / self.learning_rate + return agged_tensor + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + sign_encode = tf.math.greater_equal(tensor_flatten, 0) + mean = tf.math.reduce_mean(tf.abs(tensor_flatten)) + ctx = tensor_shape + tensor_compressed = mean, sign_encode + + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + """Decoding the signs to float format """ + mean, sign_encode = tensor_compressed + tensor_shape = ctx + sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 + sign_decode = mean * sign_decode + tensor_decompressed = tf.reshape(sign_decode, tensor_shape) + 
return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/fp16.py b/untitled folder/grace_dl/tensorflow/compressor/fp16.py new file mode 100644 index 0000000..be246a4 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/fp16.py @@ -0,0 +1,25 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class FP16Compressor(Compressor): + """Compress all floating point gradients to 16-bit.""" + + def compress(self, tensor, name): + """Downcasts the tensor to 16-bit.""" + tensor_compressed = tensor + + if tensor.dtype.is_floating: + # Only allow compression from other floating point types + tensor_compressed = tf.cast(tensor, dtype=tf.float16) + return [tensor_compressed], tensor.dtype + + def decompress(self, tensor, ctx): + """Upcasts the tensor to the initialization dtype.""" + + tensor_decompressed, = tensor + dtype = ctx + if dtype.is_floating: + tensor_decompressed = tf.cast(tensor, dtype=dtype) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py b/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py new file mode 100644 index 0000000..b973598 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py @@ -0,0 +1,188 @@ +import math + +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class INCEPTIONNCompressor(Compressor): + """ + (2018). A network-centric hardware/algorithm co-design to accelerate distributed training of deep neural networks. 
+ https://doi.org/10.1109/MICRO.2018.00023 + """ + def __init__(self, error_bound): + super().__init__(tensors_size_are_same=False) + self.error_bound = error_bound + + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + tensor_cast = tf.bitcast(tensor_flatten, tf.uint32) + sign = tf.bitwise.bitwise_and(tensor_cast, 0b10000000000000000000000000000000) + exp = tf.bitwise.bitwise_and(tensor_cast, 0b01111111100000000000000000000000) + mantissa = tf.bitwise.bitwise_and(tensor_cast, 0b00000000011111111111111111111111) + + exp = tf.bitwise.right_shift(exp, 23) + + error_bound_val = self.error_bound + error_bound = 127 + int(math.log(error_bound_val / 2, 10)) # error_bound exponent: 117 for 2e-10 + radius = math.ceil((127 - error_bound) / 2) + mid = error_bound + radius + mask_32bit = exp >= 127 + mask_16bit = (exp >= mid) & (exp < 127) + mask_8bit = (exp >= error_bound) & (exp < mid) + indices_32bit = tf.reshape(tf.where(mask_32bit), [-1]) + indices_16bit = tf.reshape(tf.where(mask_16bit), [-1]) + indices_8bit = tf.reshape(tf.where(mask_8bit), [-1]) + + # no compress + v_32bit = tf.gather(tensor_flatten, indices_32bit) + + # 16bit compress + s_16bit = tf.gather(sign, indices_16bit) + e_16bit = tf.gather(exp, indices_16bit) + m_16bit = tf.gather(mantissa, indices_16bit) + n_shift = 127 - tf.cast(e_16bit, dtype=tf.int32) + n_shift = tf.cast(n_shift, tf.uint32) + shifted_s = tf.bitwise.right_shift(s_16bit, 8) + marker = 0b00000000010000000000000000000000 + m_16bit_concat = tf.bitwise.bitwise_or(tf.bitwise.right_shift(m_16bit, 1), marker) + shifted_m = tf.bitwise.right_shift(m_16bit_concat, n_shift) + temp = tf.bitwise.bitwise_or(shifted_s, shifted_m) + v_16bit = tf.cast(tf.bitwise.right_shift(temp, 8), dtype=tf.uint16) + + # 8bit compress + s_8bit = tf.gather(sign, indices_8bit) + e_8bit = tf.gather(exp, indices_8bit) + m_8bit = tf.gather(mantissa, indices_8bit) + n_shift = 127 - tf.cast(e_8bit, dtype=tf.int32) + 
n_shift = tf.cast(n_shift, tf.uint32) + shifted_s = tf.bitwise.right_shift(s_8bit, 8) + marker = 0b00000000010000000000000000000000 + m_8bit_concat = tf.bitwise.bitwise_or(tf.bitwise.right_shift(m_8bit, 1), marker) + shifted_m = tf.bitwise.right_shift(m_8bit_concat, n_shift) + temp = tf.bitwise.bitwise_or(shifted_s, shifted_m) + v_8bit = tf.cast(tf.bitwise.right_shift(temp, 16), dtype=tf.uint8) + + def encode_byte(a): + # input: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) + # output: encoded uint8 type tensor + a = tf.reshape(a, [-1]) + pad_size = 4 - tf.mod(tf.size(a), 4) + pad = tf.range(0, pad_size) + a = tf.concat([a, pad], 0) + a_split1, a_split2, a_split3, a_split4 = tf.split(a, 4) + + # encode 4 grads into 1 Byte + sum_1 = tf.add(a_split1, a_split2 * 4) + sum_2 = tf.add(a_split3 * 16, a_split4 * 64) + sum_all = tf.add(sum_1, sum_2) + return tf.cast(sum_all, tf.uint8) + + # encode indices + mask_encode = 0 + for mask, code in zip([mask_8bit, mask_16bit, mask_32bit], [1, 2, 3]): + mask_encode += tf.cast(mask, tf.int32) * code + mask_encode = encode_byte(mask_encode) + v_32bit = tf.reshape(v_32bit, [-1]) + v_16bit = tf.reshape(v_16bit, [-1]) + v_8bit = tf.reshape(v_8bit, [-1]) + mask_encode = tf.reshape(mask_encode, [-1]) + tensor_compressed = v_32bit, v_16bit, v_8bit, mask_encode + ctx = tensor_shape + + return tensor_compressed, ctx + + # decompress + def decompress(self, tensor_compressed, ctx): + + def decode_byte(encoded, real_size): + # input: encoded uint8 type tensor + # output: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) + a = tf.cast(encoded, tf.int32) + a_split1 = tf.mod(a, 4) + a_split2 = tf.cast(tf.mod(a / 4, 4), tf.int32) + a_split3 = tf.cast(tf.mod(a / 16, 4), tf.int32) + a_split4 = tf.cast(tf.mod(a / 64, 4), tf.int32) + a = tf.concat([a_split1, a_split2, a_split3, a_split4], 0) + a = a[:real_size] + return a + + v_32bit, v_16bit, v_8bit, mask_encode = tensor_compressed + tensor_shape = 
ctx + tensor_size = tf.math.reduce_prod(tensor_shape) + + # decode mask and gather indices + mask_decode = decode_byte(mask_encode, tensor_size) + mask_32bit = tf.equal(mask_decode, 3) + mask_16bit = tf.equal(mask_decode, 2) + mask_8bit = tf.equal(mask_decode, 1) + indices_32bit = tf.reshape(tf.where(mask_32bit), [-1]) + indices_16bit = tf.reshape(tf.where(mask_16bit), [-1]) + indices_8bit = tf.reshape(tf.where(mask_8bit), [-1]) + + edges_16bit = [0, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536] + edges_8bit = [0, 2, 4, 8, 16, 32, 64, 128, 256] + import tensorflow_probability as tfp + + # 16bit decompress + # get the sign bit s_16bit and remove MSB from v_16bit + s_16bit = tf.bitwise.bitwise_and(v_16bit, 0b1000000000000000) + s_16bit = tf.cast(s_16bit, dtype=tf.int32) + s_16bit = tf.bitwise.left_shift(s_16bit, 16) + v_16bit = tf.bitwise.left_shift(v_16bit, 1) + + # 8bit decompress + # get the sign bit s_8bit and remove MSB from v_8bit + s_8bit = tf.bitwise.bitwise_and(v_8bit, 0b10000000) + s_8bit = tf.cast(s_8bit, dtype=tf.int32) + s_8bit = tf.bitwise.left_shift(s_8bit, 24) + v_8bit = tf.bitwise.left_shift(v_8bit, 1) + + # find the marker bit in v_16bit and get the exponent + zero_tensor = tf.Variable(tf.zeros([tensor_size], dtype=tf.int32), trainable=False) + op = zero_tensor.assign(tf.zeros([tensor_size], dtype=tf.int32)) + with tf.control_dependencies([op]): + temp = tf.scatter_update(zero_tensor, indices_16bit, tf.cast(v_16bit, tf.int32)) + temp = tf.scatter_update(temp, indices_8bit, tf.cast(v_8bit, tf.int32)) + n_shift_all = tfp.stats.find_bins(tf.cast(temp, dtype=tf.int32), edges_16bit) + + n_shift = 16 - tf.gather(n_shift_all, indices_16bit) + e_16bit = 127 - (n_shift - 1) + e_16bit = tf.bitwise.left_shift(e_16bit, 23) + + # restore the mantissa + n_shift = tf.cast(n_shift, dtype=tf.uint16) + v_16bit = tf.bitwise.left_shift(v_16bit, n_shift) + v_16bit = tf.cast(v_16bit, dtype=tf.int32) + m_16bit = 
tf.bitwise.left_shift(v_16bit, 7) + + # concat all + temp = tf.bitwise.bitwise_or(s_16bit, e_16bit) + v_16bit = tf.bitwise.bitwise_or(temp, m_16bit) + v_16bit = tf.bitcast(v_16bit, tf.float32) + + # find the marker bit in v_8bit and get the exponent + n_shift = 8 - tf.gather(n_shift_all, indices_8bit) + e_8bit = 127 - (n_shift - 1) + e_8bit = tf.bitwise.left_shift(e_8bit, 23) + + # restore the mantissa + n_shift = tf.cast(n_shift, dtype=tf.uint8) + v_8bit = tf.bitwise.left_shift(v_8bit, n_shift) + v_8bit = tf.cast(v_8bit, dtype=tf.int32) + m_8bit = tf.bitwise.left_shift(v_8bit, 15) + + # concat all + temp = tf.bitwise.bitwise_or(s_8bit, e_8bit) + v_8bit = tf.bitwise.bitwise_or(temp, m_8bit) + v_8bit = tf.bitcast(v_8bit, tf.float32) + + zero_tensor = tf.Variable(tf.zeros([tensor_size], dtype=tf.float32), trainable=False) + op = zero_tensor.assign(tf.zeros([tensor_size], dtype=tf.float32)) + with tf.control_dependencies([op]): + temp = tf.scatter_update(zero_tensor, indices_32bit, v_32bit) + temp = tf.scatter_update(temp, indices_16bit, v_16bit) + temp = tf.scatter_update(temp, indices_8bit, v_8bit) + tensor_decompressed = tf.reshape(temp, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/natural.py b/untitled folder/grace_dl/tensorflow/compressor/natural.py new file mode 100644 index 0000000..262be68 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/natural.py @@ -0,0 +1,42 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class NaturalCompressor(Compressor): + """ + (2019). Natural compression for distributed deep learning. 
+ Retrieved from https://arxiv.org/abs/1905.10988 + """ + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + + tensor_cast = tf.bitcast(tensor_flatten, tf.int32) + sign = tf.bitwise.bitwise_and(tensor_cast, 0b10000000000000000000000000000000) + exp = tf.bitwise.bitwise_and(tensor_cast, 0b01111111100000000000000000000000) + mantissa = tf.bitwise.bitwise_and(tensor_cast, 0b00000000011111111111111111111111) + exp_add_one = mantissa > tf.random.uniform(tf.shape(tensor_flatten), minval=0, maxval=0x007fffff, + dtype=tf.int32) + # exp_add_one = mantissa > 0x00400000 # deterministic + exponent = tf.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) + # original exponent range: -128 ~ 127, clip to -110, 17 + # view as uint8_t: 0 ~ 255 18 145 + exp_shift = tf.clip_by_value(exponent, 0b00001001000000000000000000000000, 0b01001000100000000000000000000000) + exps = tf.bitwise.right_shift(exp_shift, 23) + # shift 18 so that 0 maps to -110 and 127 maps to 145 + # set MSB if negative + exps = tf.bitwise.bitwise_or(tf.bitwise.right_shift(sign, 24), exps - 18) + tensor_compressed = tf.cast(exps, tf.uint8) + + return [tensor_compressed], tensor_shape + + def decompress(self, tensors_compressed, tensor_shape): + tensor_compressed, = tensors_compressed + sign = tensor_compressed > 127 + exps = tf.bitwise.bitwise_and(tensor_compressed, 0b01111111) + exps_shift = tf.bitwise.left_shift(tf.cast(exps + 18, tf.int32), 23) + floats = tf.bitcast(exps_shift, tf.float32) + tensor_decompressed = tf.where(sign, -floats, floats) + tensor_decompressed = tf.multiply(tf.cast(exps >= 1, tensor_decompressed.dtype), tensor_decompressed) + return tf.reshape(tensor_decompressed, tensor_shape) diff --git a/untitled folder/grace_dl/tensorflow/compressor/none.py b/untitled folder/grace_dl/tensorflow/compressor/none.py new file mode 100644 index 0000000..0fa908c --- /dev/null +++ b/untitled 
folder/grace_dl/tensorflow/compressor/none.py @@ -0,0 +1,14 @@ +from grace_dl.tensorflow import Compressor + + +class NoneCompressor(Compressor): + """Default no-op compression.""" + + def compress(self, tensor, name): + """Returns the tensor unmodified.""" + return [tensor], None + + def decompress(self, tensors, ctx): + """Returns the tensor unmodified.""" + tensor, = tensors + return tensor diff --git a/untitled folder/grace_dl/tensorflow/compressor/onebit.py b/untitled folder/grace_dl/tensorflow/compressor/onebit.py new file mode 100644 index 0000000..f97733e --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/onebit.py @@ -0,0 +1,42 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class OneBitCompressor(Compressor): + """ + (2014). 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. + Retrieved from https://www.microsoft.com/en-us/research/publication/1-bit-stochastic-gradient-descent-and-application-to-data-parallel-distributed-training-of-speech-dnns/ + """ + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + + mask0 = tf.math.less(tensor_flatten, 0) + sum0 = tf.math.reduce_sum(tf.boolean_mask(tensor_flatten, mask0)) + num0 = tf.math.reduce_sum(tf.cast(mask0, dtype=tf.float32)) + num0 = tf.where(tf.math.greater(num0, 0), num0, 1.0) + mean0 = sum0 / num0 + + mask1 = tf.math.logical_not(mask0) + sum1 = tf.math.reduce_sum(tf.boolean_mask(tensor_flatten, mask1)) + num1 = tf.math.reduce_sum(tf.cast(mask1, dtype=tf.float32)) + num1 = tf.where(tf.math.greater(num1, 0), num1, 1.0) + mean1 = sum1 / num1 + + mean0 = tf.reshape(mean0, [-1]) + mean1 = tf.reshape(mean1, [-1]) + mean = tf.concat([mean0, mean1], 0) + ctx = tensor_shape + tensor_compressed = mask0, mean + + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + tensor_shape = ctx + mask0, mean = tensor_compressed + 
mean0, mean1 = tf.split(mean, 2) + mask0 = tf.cast(mask0, dtype=tf.float32) + tensor_decompressed = mask0 * mean0 + (1 - mask0) * mean1 + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/packing.py b/untitled folder/grace_dl/tensorflow/compressor/packing.py new file mode 100644 index 0000000..91ecd34 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/packing.py @@ -0,0 +1,30 @@ +import tensorflow as tf + + +def encode_byte(a): + # input: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) + # output: encoded uint8 type tensor + a = tf.reshape(a, [-1]) + pad_size = 4 - tf.mod(tf.size(a), 4) + pad = tf.range(0, pad_size) + a = tf.concat([a, pad], 0) + a_split1, a_split2, a_split3, a_split4 = tf.split(a, 4) + + # encode 4 grads into 1 Byte + sum_1 = tf.add(a_split1, a_split2 * 4) + sum_2 = tf.add(a_split3 * 16, a_split4 * 64) + sum_all = tf.add(sum_1, sum_2) + return tf.cast(sum_all, tf.uint8) + + +def decode_byte(encoded, real_size): + # input: encoded uint8 type tensor + # output: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) + a = tf.cast(encoded, tf.int32) + a_split1 = tf.mod(a, 4) + a_split2 = tf.cast(tf.mod(a / 4, 4), tf.int32) + a_split3 = tf.cast(tf.mod(a / 16, 4), tf.int32) + a_split4 = tf.cast(tf.mod(a / 64, 4), tf.int32) + a = tf.concat([a_split1, a_split2, a_split3, a_split4], 0) + a = a[:real_size] + return a \ No newline at end of file diff --git a/untitled folder/grace_dl/tensorflow/compressor/powersgd.py b/untitled folder/grace_dl/tensorflow/compressor/powersgd.py new file mode 100644 index 0000000..707b2a2 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/powersgd.py @@ -0,0 +1,58 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor +from horovod.tensorflow import _allreduce + + +class PowerSGDCompressor(Compressor): + """ + PowerSGD: Practical 
Low-Rank Gradient Compression for Distributed Optimization. + T. Vogels, S. P. Karimireddy, and M. Jaggi. In NeurIPS, 2019. + """ + + def __init__(self, momentum_factor, world_size): + super().__init__() + self.q_memory = {} + self.world_size = world_size + self.momentum = {} + self.momentum_factor = momentum_factor + for v in tf.trainable_variables(): + self.momentum[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) + + def compress(self, tensor, name): + tensor_dims = len(tensor.get_shape().as_list()) + if tensor_dims == 1: + return [tensor], None + + tensor_shape = tf.shape(tensor) + matrix = tf.reshape(tensor, [tensor_shape[0], -1]) + + q = self.q_memory[name] + q, _ = tf.linalg.qr(q) + + p = tf.linalg.matmul(matrix, q) + p = _allreduce(p) / self.world_size + p, _ = tf.linalg.qr(p) + q = tf.linalg.matmul(matrix, p, transpose_a=True) + q = _allreduce(q) / self.world_size + new_q = self.q_memory[name].assign(q) + ctx = p, new_q, tensor_shape, name + # variable initialization needs to be called before communication starts + # self.momentum[tensor.name] = tf.Variable(tf.zeros_like(tensor), trainable=False) + + return [], ctx + + def decompress(self, tensors, ctx): + if ctx is None: + tensor, = tensors + return tensor + + p, q, tensor_shape, tensor_name = ctx + new_tensor = tf.linalg.matmul(p, q, transpose_b=True) + new_tensor = tf.reshape(new_tensor, tensor_shape) + + new_momentum = self.momentum[tensor_name].assign(self.momentum_factor * self.momentum[tensor_name] + + (1 - self.momentum_factor) * new_tensor) + tensor_decompressed = new_momentum + new_tensor + + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/qsgd.py b/untitled folder/grace_dl/tensorflow/compressor/qsgd.py new file mode 100644 index 0000000..608153e --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/qsgd.py @@ -0,0 +1,44 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class QSGDCompressor(Compressor): + """ 
+ (2016). QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding. + Retrieved from http://arxiv.org/abs/1610.02132 + """ + def __init__(self, quantum_num): + super().__init__() + self.quantum_num = quantum_num + + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + norm = tf.reshape(tf.norm(tensor_flatten), [-1]) + abs_gradient = tf.abs(tensor_flatten) + qnum = tf.cast(self.quantum_num, dtype=tf.float32) + + level_float = qnum / norm * abs_gradient + previous_level = tf.math.floor(level_float) + prob = tf.random.uniform(tf.shape(tensor_flatten)) + is_next_level = tf.cast(tf.math.less(prob, (level_float - previous_level)), tf.float32) + new_level = tf.cast(previous_level + is_next_level, tf.float32) + + sign = tf.sign(tensor_flatten) + tensor_compressed = new_level * sign + tensor_compressed = tf.cast(tensor_compressed, dtype=tf.int8 if self.quantum_num < 128 else tf.int16) + tensor_compressed = tensor_compressed, norm + ctx = tensor_shape + + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + qnum = tf.cast(self.quantum_num, dtype=tf.float32) + tensor_shape = ctx + tensor_compressed, norm = tensor_compressed + + decode_output = tf.cast(tensor_compressed, dtype=tf.float32) + tensor_decompressed = norm / qnum * decode_output + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/randomk.py b/untitled folder/grace_dl/tensorflow/compressor/randomk.py new file mode 100644 index 0000000..983c3cf --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/randomk.py @@ -0,0 +1,53 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +def sparsify(tensor, compress_ratio): + elemnum = tensor.get_shape().as_list()[0] + k = max(1, int(elemnum * compress_ratio)) + _, indices = tf.math.top_k(tf.math.abs(tensor), k) + values = 
tf.gather(tensor, indices) + return indices, values + + +def desparsify(indices, values, tensor_size): + indices = tf.expand_dims(indices, 1) + tensor = tf.scatter_nd(indices, values, [tensor_size]) + return tensor + + +class RandomKCompressor(Compressor): + global_step = 0 + + def __init__(self, compress_ratio): + super().__init__() + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + """Use Python Random libraries RNG to compress by generating a list of indices to be transmitted.""" + + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + elemnum = tensor_flatten.get_shape().as_list()[0] + # all_indices = tf.range(elemnum, dtype=tf.int32) + k = max(1, int(elemnum * self.compress_ratio)) + seed = (sum([ord(s) for s in tensor.name]), tf.train.get_global_step()) + tf.compat.v1.set_random_seed(1) + indices = tf.random.stateless_uniform( + [k], seed, minval=0, maxval=elemnum + 1, dtype=tf.int32, name=None + ) + values = tf.gather(tensor_flatten, indices) + ctx = indices, tensor_shape + tensor_compressed = values + + return [tensor_compressed], ctx + + def decompress(self, tensor_compressed, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + indices, tensor_shape = ctx + values, = tensor_compressed + tensor_size = tf.math.reduce_prod(tensor_shape) + tensor_decompressed = desparsify(indices, values, tensor_size) + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/signsgd.py b/untitled folder/grace_dl/tensorflow/compressor/signsgd.py new file mode 100644 index 0000000..7a1a0c7 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/signsgd.py @@ -0,0 +1,30 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class SignSGDCompressor(Compressor): + """ + (2018). signSGD: Compressed Optimisation for Non-Convex Problems. 
+ Retrieved from https://arxiv.org/abs/1802.04434 + """ + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = tf.math.add_n(tensors) + agged_tensor = tf.cast(tf.math.greater_equal(agged_tensor, 0), dtype=tf.float32) + agged_tensor = agged_tensor * 2.0 - 1.0 + return agged_tensor + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + tensor_compressed = tf.math.greater_equal(tensor_flatten, 0) + return [tensor_compressed], tensor_shape + + def decompress(self, tensors, tensor_shape): + """Decoding the signs to float format """ + sign_encode, = tensors + sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 + tensor_decompressed = tf.reshape(sign_decode, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/signum.py b/untitled folder/grace_dl/tensorflow/compressor/signum.py new file mode 100644 index 0000000..da0ab48 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/signum.py @@ -0,0 +1,45 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class SignumCompressor(Compressor): + """ + (2018). signSGD: Compressed Optimisation for Non-Convex Problems. 
+ Retrieved from https://arxiv.org/abs/1802.04434 + """ + + def __init__(self, momentum): + super().__init__() + self.momentum_factor = momentum + self.momentum = {} + for v in tf.trainable_variables(): + self.momentum[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = tf.math.add_n(tensors) + agged_tensor = tf.cast(tf.math.greater_equal(agged_tensor, 0), dtype=tf.float32) + agged_tensor = agged_tensor * 2.0 - 1.0 + return agged_tensor + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + + # update tensor by momentum + tensor = (1.0 - self.momentum_factor) * tensor + self.momentum_factor * self.momentum[name] + op = self.momentum[name].assign(tensor) + + with tf.control_dependencies([op]): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + tensor_compressed = tf.math.greater_equal(tensor_flatten, 0) + + return [tensor_compressed], tensor_shape + + def decompress(self, tensors, tensor_shape): + """Decoding the signs to float format """ + sign_encode, = tensors + sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 + tensor_decompressed = tf.reshape(sign_decode, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/sketch.py b/untitled folder/grace_dl/tensorflow/compressor/sketch.py new file mode 100644 index 0000000..c7022b4 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/sketch.py @@ -0,0 +1,39 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class SketchCompressor(Compressor): + """ + SketchML: Accelerating Distributed Machine Learning with Data Sketches. + J. Jiang, F. Fu, T. Yang, and B. Cui. SIGMOD, 2018. 
+ """ + + def __init__(self, quantiles): + super().__init__() + self.quantiles = quantiles + + def compress(self, tensor, name): + import tensorflow_probability as tfp + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + + x = tensor_flatten + edges = tfp.stats.quantiles(x, num_quantiles=self.quantiles, interpolation='linear') + bins = tf.cast(tfp.stats.find_bins(x, edges), dtype=tf.int32) + means = tf.math.unsorted_segment_mean(x, bins, num_segments=self.quantiles) + + tensor_compressed = tf.cast(bins, dtype=tf.uint8 if self.quantiles < 256 else tf.uint16) + means = tf.reshape(means, [-1]) + ctx = tensor_shape + tensor_compressed = tensor_compressed, means + + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + tensor_shape = ctx + tensor_compressed, means = tensor_compressed + bins = tf.cast(tensor_compressed, dtype=tf.int32) + tensor_decompressed = tf.gather(means, bins) + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/terngrad.py b/untitled folder/grace_dl/tensorflow/compressor/terngrad.py new file mode 100644 index 0000000..b647030 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/terngrad.py @@ -0,0 +1,42 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class TernGradCompressor(Compressor): + """ + (2017). TernGrad: Ternary Gradients to Reduce Communication in Distributed Deep Learning. 
+ Retrieved from http://arxiv.org/abs/1705.07878 + """ + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + + std = tf.math.square(tensor_flatten - tf.math.reduce_mean(tensor_flatten)) + std = tf.math.sqrt(tf.math.reduce_mean(std)) + c = 2.5 + gradient = tf.clip_by_value(tensor_flatten, -c * std, c * std) + scaler = tf.math.reduce_max(tf.math.abs(gradient)) + + zeros = tf.zeros(tf.shape(tensor_flatten)) + abs_gradient = tf.abs(gradient) + sign_gradient = tf.sign(gradient) + rnd_sample = tf.random.uniform(tf.shape(tensor_flatten), 0, scaler) + where_cond = tf.less(rnd_sample, abs_gradient) + binarized_gradient = tf.where(where_cond, sign_gradient * scaler, zeros) + new_sign = tf.sign(binarized_gradient) # -1, 0, 1 + + scaler = tf.reshape(scaler, [-1]) + ctx = tensor_shape + tensor_compressed = tf.cast(new_sign, tf.int8), scaler + + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + tensor_shape = ctx + tensor_compressed, scaler = tensor_compressed + + sign = tf.cast(tensor_compressed, dtype=tf.float32) + tensor_decompressed = sign * scaler + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/threshold.py b/untitled folder/grace_dl/tensorflow/compressor/threshold.py new file mode 100644 index 0000000..dd30042 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/threshold.py @@ -0,0 +1,33 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class ThresholdCompressor(Compressor): + + def __init__(self, threshold): + super().__init__(tensors_size_are_same=False) + self.threshold = threshold + + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + thr_mask = tf.math.greater(tf.math.abs(tensor_flatten), self.threshold) + values = tf.boolean_mask(tensor_flatten, thr_mask) + 
indices = tf.reshape(tf.where(thr_mask), [-1]) + ctx = tensor_shape + values = tf.bitcast(values, tf.int32) + indices = tf.cast(indices, dtype=tf.int32) + tensor_compressed = tf.concat([values, indices], 0) + return [tensor_compressed], ctx + + def decompress(self, tensors_compressed, ctx): + tensor_compressed, = tensors_compressed + values, indices = tf.split(tensor_compressed, 2) + values = tf.bitcast(values, tf.float32) + tensor_shape = ctx + tensor_size = tf.math.reduce_prod(tensor_shape) + indices = tf.expand_dims(indices, 1) + tensor_decompressed = tf.scatter_nd(indices, values, [tensor_size]) + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/topk.py b/untitled folder/grace_dl/tensorflow/compressor/topk.py new file mode 100644 index 0000000..773cd77 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/topk.py @@ -0,0 +1,49 @@ +import tensorflow as tf + +from grace.grace_dl.tensorflow import Compressor + + +def sparsify(tensor, compress_ratio): + elemnum = tensor.get_shape().as_list()[0] + k = max(1, int(elemnum * compress_ratio)) + _, indices = tf.math.top_k(tf.math.abs(tensor), k, sorted=False) + values = tf.gather(tensor, indices) + return values, indices + + +def desparsify(indices, values, tensor_size): + indices = tf.expand_dims(indices, 1) + tensor = tf.scatter_nd(indices, values, [tensor_size]) + return tensor + + +class TopKCompressor(Compressor): + def __init__(self, compress_ratio): + super().__init__() + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + elemnum = tensor_flatten.get_shape().as_list()[0] + + values, indices = sparsify(tensor_flatten, self.compress_ratio) + + indices = tf.cast(indices, tf.int32) + values = tf.bitcast(values, tf.int32) + tensor_compressed = tf.concat([values, indices], 0) + ctx = tensor_shape, 
elemnum + return [tensor_compressed], ctx + + def decompress(self, tensors_compressed, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + tensor_compressed, = tensors_compressed + values, indices = tf.split(tensor_compressed, 2) + values = tf.bitcast(values, tf.float32) + tensor_shape, tensor_size = ctx + #tensor_size = tf.math.reduce_prod(tensor_shape) + + tensor_decompressed = desparsify(indices, values, tensor_size) + + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/u8bit.py b/untitled folder/grace_dl/tensorflow/compressor/u8bit.py new file mode 100644 index 0000000..18b32b9 --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/compressor/u8bit.py @@ -0,0 +1,110 @@ +import tensorflow as tf + +from grace_dl.tensorflow import Compressor + + +class U8bitCompressor(Compressor): + """ + (2015). 8-Bit Approximations for Parallelism in Deep Learning. 
+ Retrieved from http://arxiv.org/abs/1511.04561 + """ + def compress(self, tensor, name): + dict128 = tf.constant( + [ + 1.5000001e-06, 2.7500000e-06, 7.2499997e-06, 1.8750001e-05, + 3.6250000e-05, 5.8749996e-05, 8.6249995e-05, 1.4375000e-04, + 2.3125000e-04, 3.1875001e-04, 4.0625001e-04, 5.1874999e-04, + 6.5624999e-04, 7.9374999e-04, 9.3124999e-04, 1.2187500e-03, + 1.6562500e-03, 2.0937501e-03, 2.5312500e-03, 2.9687500e-03, + 3.4062499e-03, 3.8437501e-03, 4.2812498e-03, 4.8437500e-03, + 5.5312500e-03, 6.2187500e-03, 6.9062500e-03, 7.5937500e-03, + 8.2812496e-03, 8.9687500e-03, 9.6562495e-03, 1.1093750e-02, + 1.3281250e-02, 1.5468750e-02, 1.7656250e-02, 1.9843750e-02, + 2.2031249e-02, 2.4218749e-02, 2.6406251e-02, 2.8593751e-02, + 3.0781250e-02, 3.2968748e-02, 3.5156250e-02, 3.7343752e-02, + 3.9531250e-02, 4.1718751e-02, 4.3906249e-02, 4.6718750e-02, + 5.0156251e-02, 5.3593751e-02, 5.7031251e-02, 6.0468748e-02, + 6.3906237e-02, 6.7343749e-02, 7.0781253e-02, 7.4218743e-02, + 7.7656247e-02, 8.1093743e-02, 8.4531240e-02, 8.7968737e-02, + 9.1406241e-02, 9.4843738e-02, 9.8281242e-02, 1.0546875e-01, + 1.1640625e-01, 1.2734374e-01, 1.3828126e-01, 1.4921875e-01, + 1.6015625e-01, 1.7109375e-01, 1.8203124e-01, 1.9296876e-01, + 2.0390625e-01, 2.1484375e-01, 2.2578125e-01, 2.3671874e-01, + 2.4765626e-01, 2.5859374e-01, 2.6953125e-01, 2.8046876e-01, + 2.9140624e-01, 3.0234376e-01, 3.1328124e-01, 3.2421875e-01, + 3.3515626e-01, 3.4609374e-01, 3.5703126e-01, 3.6796874e-01, + 3.7890625e-01, 3.8984376e-01, 4.0078124e-01, 4.1171876e-01, + 4.2265624e-01, 4.3359375e-01, 4.4453126e-01, 4.5859376e-01, + 4.7578123e-01, 4.9296874e-01, 5.1015621e-01, 5.2734375e-01, + 5.4453123e-01, 5.6171870e-01, 5.7890624e-01, 5.9609371e-01, + 6.1328125e-01, 6.3046873e-01, 6.4765620e-01, 6.6484374e-01, + 6.8203121e-01, 6.9921869e-01, 7.1640623e-01, 7.3359370e-01, + 7.5078118e-01, 7.6796871e-01, 7.8515619e-01, 8.0234367e-01, + 8.1953120e-01, 8.3671868e-01, 8.5390615e-01, 8.7109369e-01, + 8.8828117e-01, 
9.0546864e-01, 9.2265618e-01, 9.3984365e-01, + 9.5703113e-01, 9.7421867e-01, 9.9140614e-01, 9.9570298e-01, + ], dtype=tf.float32 + ) + + tensor_shape = tf.shape(tensor) + tensor_flatten = tf.reshape(tensor, [-1]) + + scaler = tf.math.reduce_max(tf.abs(tensor_flatten)) + new_tensor = tensor_flatten / scaler + sign = tf.sign(tensor_flatten) + new_tensor = tf.abs(new_tensor) + + import tensorflow_probability as tfp + edges = dict128 + bins = tf.cast(tfp.stats.find_bins(new_tensor, edges), dtype=tf.int8) + + scaler = tf.reshape(scaler, [-1]) + tensor_compressed = bins * tf.cast(sign, dtype=tf.int8), scaler + + return tensor_compressed, tensor_shape + + def decompress(self, tensor_compressed, tensor_shape): + tensor, scaler = tensor_compressed + dict128 = tf.constant( + [ + 1.5000001e-06, 2.7500000e-06, 7.2499997e-06, 1.8750001e-05, + 3.6250000e-05, 5.8749996e-05, 8.6249995e-05, 1.4375000e-04, + 2.3125000e-04, 3.1875001e-04, 4.0625001e-04, 5.1874999e-04, + 6.5624999e-04, 7.9374999e-04, 9.3124999e-04, 1.2187500e-03, + 1.6562500e-03, 2.0937501e-03, 2.5312500e-03, 2.9687500e-03, + 3.4062499e-03, 3.8437501e-03, 4.2812498e-03, 4.8437500e-03, + 5.5312500e-03, 6.2187500e-03, 6.9062500e-03, 7.5937500e-03, + 8.2812496e-03, 8.9687500e-03, 9.6562495e-03, 1.1093750e-02, + 1.3281250e-02, 1.5468750e-02, 1.7656250e-02, 1.9843750e-02, + 2.2031249e-02, 2.4218749e-02, 2.6406251e-02, 2.8593751e-02, + 3.0781250e-02, 3.2968748e-02, 3.5156250e-02, 3.7343752e-02, + 3.9531250e-02, 4.1718751e-02, 4.3906249e-02, 4.6718750e-02, + 5.0156251e-02, 5.3593751e-02, 5.7031251e-02, 6.0468748e-02, + 6.3906237e-02, 6.7343749e-02, 7.0781253e-02, 7.4218743e-02, + 7.7656247e-02, 8.1093743e-02, 8.4531240e-02, 8.7968737e-02, + 9.1406241e-02, 9.4843738e-02, 9.8281242e-02, 1.0546875e-01, + 1.1640625e-01, 1.2734374e-01, 1.3828126e-01, 1.4921875e-01, + 1.6015625e-01, 1.7109375e-01, 1.8203124e-01, 1.9296876e-01, + 2.0390625e-01, 2.1484375e-01, 2.2578125e-01, 2.3671874e-01, + 2.4765626e-01, 2.5859374e-01, 
2.6953125e-01, 2.8046876e-01, + 2.9140624e-01, 3.0234376e-01, 3.1328124e-01, 3.2421875e-01, + 3.3515626e-01, 3.4609374e-01, 3.5703126e-01, 3.6796874e-01, + 3.7890625e-01, 3.8984376e-01, 4.0078124e-01, 4.1171876e-01, + 4.2265624e-01, 4.3359375e-01, 4.4453126e-01, 4.5859376e-01, + 4.7578123e-01, 4.9296874e-01, 5.1015621e-01, 5.2734375e-01, + 5.4453123e-01, 5.6171870e-01, 5.7890624e-01, 5.9609371e-01, + 6.1328125e-01, 6.3046873e-01, 6.4765620e-01, 6.6484374e-01, + 6.8203121e-01, 6.9921869e-01, 7.1640623e-01, 7.3359370e-01, + 7.5078118e-01, 7.6796871e-01, 7.8515619e-01, 8.0234367e-01, + 8.1953120e-01, 8.3671868e-01, 8.5390615e-01, 8.7109369e-01, + 8.8828117e-01, 9.0546864e-01, 9.2265618e-01, 9.3984365e-01, + 9.5703113e-01, 9.7421867e-01, 9.9140614e-01, 9.9570298e-01, + ], dtype=tf.float32 + ) + # tensor is int8 + tensor = tf.cast(tensor, dtype=tf.int32) + sign = tf.cast(tf.sign(tensor), dtype=tf.float32) + index = tf.cast(tf.abs(tensor), dtype=tf.int32) + tensor_decompressed = tf.gather(dict128, index) * scaler * sign + tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/helper.py b/untitled folder/grace_dl/tensorflow/helper.py new file mode 100644 index 0000000..3a2f1ce --- /dev/null +++ b/untitled folder/grace_dl/tensorflow/helper.py @@ -0,0 +1,107 @@ +def grace_from_params(params): + import horovod.tensorflow as hvd + world_size = hvd.size() + comp = params.get('compressor', 'none') + mem = params.get('memory', 'none') + comm = params.get('communicator', 'allreduce') + if comp == 'adaq': + from grace_dl.tensorflow.compressor.adaq import AdaqCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = AdaqCompressor(compress_ratio) + elif comp == 'dgc': + from grace_dl.tensorflow.compressor.dgc import DgcCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = DgcCompressor(compress_ratio) + elif comp == 'efsignsgd': + from 
grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor + lr = params.get('lr', 0.1) + compressor = EFSignSGDCompressor(lr) + elif comp == 'fp16': + from grace_dl.tensorflow.compressor.fp16 import FP16Compressor + compressor = FP16Compressor() + elif comp == 'inceptionn': + from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor + error_bound = params.get('error_bound', 2e-10) + compressor = INCEPTIONNCompressor(error_bound) + elif comp == 'natural': + from grace_dl.tensorflow.compressor.natural import NaturalCompressor + compressor = NaturalCompressor() + elif comp == 'none': + from grace_dl.tensorflow.compressor.none import NoneCompressor + compressor = NoneCompressor() + elif comp == 'onebit': + from grace_dl.tensorflow.compressor.onebit import OneBitCompressor + compressor = OneBitCompressor() + elif comp == 'powersgd': + from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor + momentum_factor = params.get('momentum_factor', 0.9) + compressor = PowerSGDCompressor(momentum_factor, world_size) + elif comp == 'qsgd': + from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor + quantum_num = params.get('quantum_num', 127) + compressor = QSGDCompressor(quantum_num) + elif comp == 'randomk': + from grace_dl.tensorflow.compressor.randomk import RandomKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = RandomKCompressor(compress_ratio) + elif comp == 'signsgd': + from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor + compressor = SignSGDCompressor() + elif comp == 'signum': + from grace_dl.tensorflow.compressor.signum import SignumCompressor + momentum = params.get('momentum', 0.9) + compressor = SignumCompressor(momentum) + elif comp == 'sketch': + from grace_dl.tensorflow.compressor.sketch import SketchCompressor + quantiles = params.get('quantiles', 64) + compressor = SketchCompressor(quantiles) + elif comp == 'terngrad': + from grace_dl.tensorflow.compressor.terngrad 
import TernGradCompressor + compressor = TernGradCompressor() + elif comp == 'threshold': + from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor + threshold = params.get('threshold', 0.01) + compressor = ThresholdCompressor(threshold) + elif comp == 'topk': + from grace_dl.tensorflow.compressor.topk import TopKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = TopKCompressor(compress_ratio) + elif comp == 'u8bit': + from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor + compressor = U8bitCompressor() + else: + raise NotImplementedError(comp) + + if mem == 'dgc': + from grace_dl.tensorflow.memory.dgc import DgcMemory + momentum = params.get('momentum', 0.9) + gradient_clipping = params.get('gradient_clipping', False) + memory = DgcMemory(momentum, gradient_clipping, world_size) + elif mem == 'none': + from grace_dl.tensorflow.memory.none import NoneMemory + memory = NoneMemory() + elif mem == 'powersgd': + from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory + compress_rank = params.get('compress_rank', 1) + memory = PowerSGDMemory(compressor.q_memory, compress_rank) + elif mem == 'residual': + from grace_dl.tensorflow.memory.residual import ResidualMemory + memory = ResidualMemory() + elif mem == 'efsignsgd': + from grace_dl.tensorflow.memory.efsignsgd import EFSignSGDMemory + lr = params.get('lr', 0.1) + memory = EFSignSGDMemory(lr) + else: + raise NotImplementedError(mem) + + if comm == 'allreduce': + from grace_dl.tensorflow.communicator.allreduce import Allreduce + return Allreduce(compressor, memory, world_size) + elif comm == 'allgather': + from grace_dl.tensorflow.communicator.allgather import Allgather + return Allgather(compressor, memory, world_size) + elif comm == 'broadcast': + from grace_dl.tensorflow.communicator.broadcast import Broadcast + return Broadcast(compressor, memory, world_size) + else: + raise NotImplementedError(comm) diff --git a/untitled 
folder/grace_dl/tensorflow/memory/.DS_Store b/untitled folder/grace_dl/tensorflow/memory/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0fgc5S?v9>(Bx@v{HMs#5IVdqD2)KH-tOF2o8XP9b2u1?X}{BsEQ)_4u6O% z2Yv!vIl?bM+?d(jrgmZn^@3=-W9>d?cV<7{i@llz09a?_wE=bjz`#P-*ubj9sGrJ$ zt*Dl5M27JIBZwh@J~%L3@YW5Q0nLDBKr^5j&lQ}vRBud0nNa*WPsie zCKf_VVJ1<$IR58IJuXxg%N%Gl?=B zm<$J|W@aiwVPbZSGle@aOQNnd1Db(F1}Jym#EhLogjYU)=P+=`Y2t?5MW0V@5(eq8 z)%wa-R%fxcvGth8Lnny+fd~jZ zA6;I*3}TOuJA4#-sfe{@hfz1`y~gfja&P~zY3?1gr%iLxZav&tgDzPE*rqGFBB`To38 z!S9(;g)t(2u27XvQJJTtOks>UMW|n!X5ex$uqv0kL;3%c>fiq_*QWGpH3OP~3o*cI zUAx=C5ISzljB;%g%UdjDh_gQo@!mvNim?=C60rsCpZ^eWs?UEx KysoY_1AhR)|7g?z literal 0 HcmV?d00001 diff --git a/untitled folder/grace_dl/torch/__init__.py b/untitled folder/grace_dl/torch/__init__.py new file mode 100644 index 0000000..4030ee4 --- /dev/null +++ b/untitled folder/grace_dl/torch/__init__.py @@ -0,0 +1,58 @@ +from abc import ABC, abstractmethod + + +class Memory(ABC): + @abstractmethod + def compensate(self, tensor, name): + """Update the tensor with the residuals.""" + raise NotImplemented("compensate was not implemented.") + + def update(self, tensor, name, compressor, tensor_compressed, ctx): + """Update the residuals.""" + pass + + +class Compressor(ABC): + """Interface for compressing and decompressing a given tensor.""" + + def __init__(self, average=True, tensors_size_are_same=True): + self.average = average + self.tensors_size_are_same = tensors_size_are_same + + @abstractmethod + def compress(self, tensor, name): + """Compresses a tensor and returns it with the context needed to decompress it.""" + raise NotImplemented("compress was not implemented.") + + @abstractmethod + def decompress(self, tensors, ctx): + """Decompress the 
tensor with the given context.""" + raise NotImplemented("decompress was not implemented.") + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + return sum(tensors) + + +class Communicator(ABC): + @abstractmethod + def async_send(self, tensors, name,process_set): + raise NotImplemented("async_send was not implemented.") + + @abstractmethod + def wait_receive(self, handles, ctx,process_set): + raise NotImplemented("wait_receive was not implemented.") + + def __init__(self, compressor, memory): + self.compressor = compressor + self.memory = memory + + def send_step(self, tensor, name,process_set): + tensor = self.memory.compensate(tensor, name) + tensors_compressed, ctx = self.compressor.compress(tensor, name) + self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) + handles = self.async_send(tensors_compressed, name,process_set) + return handles, ctx + + def receive_step(self, handles, ctx,process_set): + return self.wait_receive(handles, ctx,process_set) diff --git a/untitled folder/grace_dl/torch/communicator/.DS_Store b/untitled folder/grace_dl/torch/communicator/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T00 else t.numel() + tensors_size.append(size_dim0) + + if self.compressor.tensors_size_are_same: + tensors_size_ag = [tensors_size] * self.world_size # list of tensor sizes per rank + tensor_sizes = zip(*tensors_size_ag) # transpose + else: + tensors_size = torch.tensor(tensors_size) # TODO: set device + gathered = allgather(tensor=tensors_size,process_set=process_set) # tensor of tensor sizes per rank + tensor_sizes = gathered.view([self.world_size, -1]).t().tolist() # transpose, to list + + handles = [] + for tensor_compressed in tensors_compressed: + handle = 
allgather_async(tensor=tensor_compressed,process_set=process_set) + handles.append(handle) + + return handles, tensor_sizes + + def wait_receive(self, result, ctx,process_set): + handles, tensor_sizes = result + # print("Result",result) + tensors_ag = [] + for handle, sizes in zip(handles, tensor_sizes): + gathered = synchronize(handle) + # print("gathered",gathered) + # print("gathered",len(gathered)) + # print("sizes",sizes) + tensors_ag.append(gathered.split(sizes)) + # import time + # time.sleep(0.001) + list_tensor_decompressed = [] + for tensor_compressed in zip(*tensors_ag): + tensor_decompressed = self.compressor.decompress(tensor_compressed, ctx) + list_tensor_decompressed.append(tensor_decompressed) + + tensors_aggregated = self.compressor.aggregate(list_tensor_decompressed) + return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/torch/communicator/allreduce.py b/untitled folder/grace_dl/torch/communicator/allreduce.py new file mode 100644 index 0000000..98e5396 --- /dev/null +++ b/untitled folder/grace_dl/torch/communicator/allreduce.py @@ -0,0 +1,15 @@ +from grace.grace_dl.torch import Communicator +from horovod.torch import allreduce_async_, synchronize + + +class Allreduce(Communicator): + + def async_send(self, tensors_compressed, name): + handles = [] + for i, tensor_compressed in enumerate(tensors_compressed): + handles.append(allreduce_async_(tensor_compressed, self.compressor.average, name + str(i))) + return handles + + def wait_receive(self, handles, ctx): + output = [synchronize(h) for h in handles] + return self.compressor.decompress(output, ctx) diff --git a/untitled folder/grace_dl/torch/communicator/broadcast.py b/untitled folder/grace_dl/torch/communicator/broadcast.py new file mode 100644 index 0000000..7b906e6 --- /dev/null +++ b/untitled folder/grace_dl/torch/communicator/broadcast.py @@ -0,0 +1,27 @@ +from grace_dl.torch import Communicator +from 
horovod.torch import broadcast_async, synchronize + + +class Broadcast(Communicator): + + def __init__(self, compressor, memory, world_size): + super().__init__(compressor, memory) + self.world_size = world_size + + def async_send(self, tensors_compressed, name): + handles = [] + for root_rank in range(self.world_size): + rank_handles = [] + for i, tensor_compressed in enumerate(tensors_compressed): + rank_handles.append(broadcast_async(tensor_compressed, root_rank, name + str(root_rank) + '_' + str(i))) + handles.append(rank_handles) + return handles + + def wait_receive(self, handles, ctx): + tensors_decompressed = [] + for ranki in handles: + tensors_compressed = [synchronize(h) for h in ranki] + tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) + tensors_decompressed.append(tensor_decompressed) + tensor_aggregated = self.compressor.aggregate(tensors_decompressed) + return (tensor_aggregated / self.world_size) if self.compressor.average else tensor_aggregated diff --git a/untitled folder/grace_dl/torch/compressor/.DS_Store b/untitled folder/grace_dl/torch/compressor/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0= thr + selected = mask.sum() + + for _ in range(10): + if selected > 1.3 * numel * self.compress_ratio: + thr = 1.3 * thr + elif selected < 0.7 * numel * self.compress_ratio: + thr = 0.7 * thr + else: + break + mask = tensor.abs() >= thr + selected = mask.sum() + + indices, = torch.where(mask) + values = tensor[indices] + + tensor_compressed = values, indices + ctx = shape, mask, numel + return tensor_compressed, ctx + + def decompress(self, tensor_compressed, ctx): + values, indices = tensor_compressed + shape, _, numel = ctx + tensor_decompressed = torch.zeros(numel, 
dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/efsignsgd.py b/untitled folder/grace_dl/torch/compressor/efsignsgd.py new file mode 100644 index 0000000..f981d0f --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/efsignsgd.py @@ -0,0 +1,33 @@ +import torch + +from grace_dl.torch import Compressor + + +class EFSignSGDCompressor(Compressor): + + def __init__(self, lr): + super().__init__(average=False) + self.learning_rate = lr + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + shape = tensor.size() + tensor = tensor.flatten() + + sign_encode = tensor >= 0 + mean = tensor.abs().mean() + tensor_compressed = mean, sign_encode.type(torch.uint8) + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + """Decoding the signs to float format """ + mean, sign_encode = tensor_compressed + sign_decode = sign_encode.type(torch.float32) * 2 - 1 + sign_decode = mean * sign_decode + tensor_decompressed = sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + return sum(tensors) / self.learning_rate diff --git a/untitled folder/grace_dl/torch/compressor/fp16.py b/untitled folder/grace_dl/torch/compressor/fp16.py new file mode 100644 index 0000000..5021ae6 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/fp16.py @@ -0,0 +1,22 @@ +import torch + +from grace_dl.torch import Compressor + + +class FP16Compressor(Compressor): + """Compress all floating point gradients to 16-bit.""" + + def compress(self, tensor): + """Downcasts the tensor to 16-bit.""" + dtype = tensor.dtype + if dtype.is_floating_point: + # Only allow compression from other floating point types + tensor = tensor.type(torch.float16) + return tensor, dtype + + def decompress(self, tensors, dtype): + 
"""Upcasts the tensor to the initialization dtype.""" + tensor_decompressed = tensors + if dtype.is_floating_point: + tensor_decompressed = tensor_decompressed.type(dtype) + return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/natural.py b/untitled folder/grace_dl/torch/compressor/natural.py new file mode 100644 index 0000000..a208071 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/natural.py @@ -0,0 +1,40 @@ +import torch +import cupy # conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack + +from grace_dl.torch import Compressor + + +class NaturalCompressor(Compressor): + def __init__(self): + super().__init__() + + def compress(self, tensor, name): + shape = tensor.size() + tensor_flatten = tensor.flatten() + cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten)) + tensor_cast = cupy_tensor.view(cupy.int32) + sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000) + exp = tensor_cast & cupy.int32(0b01111111100000000000000000000000) + mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111) + exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111, + size=cupy_tensor.shape, + dtype=cupy.int32) + exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) + exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000) + exps = cupy.right_shift(exp_shift, 23) + exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18) + tensor_compressed = exps.astype(cupy.uint8) + return [from_dlpack(tensor_compressed.toDlpack())], shape + + + def decompress(self, tensor_compressed, shape): + tensor_compressed, = tensor_compressed + cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_compressed)) + sign = cupy_tensor > 127 + exps = cupy.bitwise_and(cupy_tensor, 0b01111111) + floats = cupy.left_shift((exps + 
18).astype(cupy.int32), 23).view(cupy.float32) + tensor_decompressed = cupy.where(sign, -floats, floats) + tensor_decompressed = cupy.multiply((exps >= 1).astype(cupy.float32), tensor_decompressed) + return from_dlpack(tensor_decompressed.toDlpack()).view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/none.py b/untitled folder/grace_dl/torch/compressor/none.py new file mode 100644 index 0000000..60e5916 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/none.py @@ -0,0 +1,12 @@ +from grace_dl.torch import Compressor + + +class NoneCompressor(Compressor): + """Default no-op compression.""" + + def compress(self, tensor, name): + return [tensor], None + + def decompress(self, tensors, ctx): + tensor, = tensors + return tensor diff --git a/untitled folder/grace_dl/torch/compressor/onebit.py b/untitled folder/grace_dl/torch/compressor/onebit.py new file mode 100644 index 0000000..9758734 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/onebit.py @@ -0,0 +1,32 @@ +import torch + +from grace_dl.torch import Compressor + + +class OneBitCompressor(Compressor): + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + numel = tensor.numel() + + mask0 = tensor < 0 + sum0 = torch.sum(tensor[mask0]) + num0 = torch.sum(mask0).float() + mean0 = sum0 / num0 if num0 > 0 else sum0 + + mask1 = ~mask0 + sum1 = torch.sum(tensor[mask1]) + num1 = numel - num0 + mean1 = sum1 / num1 if num1 > 0 else sum1 + + tensor_compressed = mask0.type(torch.uint8), mean0, mean1 + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + mask0, mean0, mean1 = tensor_compressed + mask0= mask0.bool() + tensor_decompressed = mask0 * mean0 + ~mask0 * mean1 + tensor_decompressed = tensor_decompressed.view(shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/powersgd.py b/untitled folder/grace_dl/torch/compressor/powersgd.py new file mode 100644 index 0000000..9dffca5 
--- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/powersgd.py @@ -0,0 +1,58 @@ +import torch + +from grace_dl.torch import Compressor +from horovod.torch import allreduce_ + + +@torch.jit.script +def orthogonalize(matrix): + n, m = matrix.shape + for i in range(m): + # Normalize the i'th column + col = matrix[:, i: i + 1] + col /= torch.sqrt(torch.sum(col ** 2)) + # Project it on the rest and remove it + if i + 1 < m: + rest = matrix[:, i + 1:] + # rest -= torch.matmul(col.t(), rest) * col + rest -= torch.sum(col * rest, dim=0) * col + + +class PowerSGDCompressor(Compressor): + """ + PowerSGD: Practical Low-Rank Gradient Compression for Distributed Optimization. + T. Vogels, S. P. Karimireddy, and M. Jaggi. In NeurIPS, 2019. + """ + + def __init__(self): + super().__init__() + self.q_memory = {} + + def compress(self, tensor, name): + if tensor.dim() == 1: + return [tensor], None + + shape = tensor.size() + matrix = tensor.view([shape[0], -1]) + q = self.q_memory[name] + # q, _ = torch.qr(q) + orthogonalize(q) + + p = torch.mm(matrix, q) + p = allreduce_(p) + # p, _ = torch.qr(p) + orthogonalize(p) + q = torch.mm(matrix.t(), p) + q = allreduce_(q) + ctx = p, q, shape + self.q_memory[name] = q + return [], ctx + + def decompress(self, tensors, ctx): + if ctx is None: + tensor, = tensors + return tensor + p, q, tensor_shape = ctx + new_tensor = torch.mm(p, q.t()) + tensor_decompressed = new_tensor.view(tensor_shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/qsgd.py b/untitled folder/grace_dl/torch/compressor/qsgd.py new file mode 100644 index 0000000..0c08dce --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/qsgd.py @@ -0,0 +1,39 @@ +import torch + +from grace_dl.torch import Compressor + + +class QSGDCompressor(Compressor): + + def __init__(self, quantum_num): + super().__init__() + self.quantum_num = quantum_num + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() 
+ + norm = tensor.norm() + norm = norm.flatten() + abs_gradient = tensor.abs() + + level_float = self.quantum_num / norm * abs_gradient + previous_level = level_float.floor() + prob = torch.empty_like(tensor).uniform_() + is_next_level = (prob < (level_float - previous_level)).type(torch.float32) + new_level = (previous_level + is_next_level) + + sign = tensor.sign() + tensor_compressed = (new_level * sign).type(torch.int16) + tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) + tensor_compressed = tensor_compressed, norm + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + tensor_compressed, norm = tensor_compressed + + decode_output = tensor_compressed.type(torch.float32) + tensor_decompressed = norm / self.quantum_num * decode_output + tensor_decompressed = tensor_decompressed.view(shape) + return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/randomk.py b/untitled folder/grace_dl/torch/compressor/randomk.py new file mode 100644 index 0000000..c345956 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/randomk.py @@ -0,0 +1,40 @@ +import torch + +from grace.grace_dl.torch import Compressor + + +def sparsify(tensor, compress_ratio): + tensor = tensor.flatten() + numel = tensor.numel() + k = max(1, int(numel * compress_ratio)) + indices = torch.randperm(numel, device=tensor.device)[:k] + values = tensor[indices] + return indices, values + + +class RandomKCompressor(Compressor): + """Python libraries Based Compress by performing sparsification (i.e., sending a ratio of the actual tensor size.""" + + def __init__(self, compress_ratio): + super().__init__() + self.global_step = 0 + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + """Use Python Random libraries RNG to compress by generating a list of indices to be transmitted.""" + + h = sum(bytes(name, encoding='utf8'), self.global_step) + self.global_step += 1 + 
torch.manual_seed(h) + indices, values = sparsify(tensor, self.compress_ratio) + + ctx = indices, tensor.numel(), tensor.size() + return [values], ctx + + def decompress(self, tensors, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + indices, numel, shape = ctx + values, = tensors + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/signsgd.py b/untitled folder/grace_dl/torch/compressor/signsgd.py new file mode 100644 index 0000000..53637e1 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/signsgd.py @@ -0,0 +1,30 @@ +import torch + +from grace_dl.torch import Compressor + + +class SignSGDCompressor(Compressor): + + def __init__(self): + super().__init__(average=False) + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + + tensor_compressed = tensor >= 0 + return [tensor_compressed.type(torch.uint8)], shape + + def decompress(self, tensors, shape): + """Decoding the signs to float format """ + sign_encode, = tensors + sign_decode = sign_encode.type(torch.float32) * 2 - 1 + tensor_decompressed = sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = sum(tensors) + sign = agged_tensor >= 0 + agged_tensor = sign * 2.0 - 1.0 + return agged_tensor diff --git a/untitled folder/grace_dl/torch/compressor/signum.py b/untitled folder/grace_dl/torch/compressor/signum.py new file mode 100644 index 0000000..5456633 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/signum.py @@ -0,0 +1,37 @@ +import torch + +from grace_dl.torch import Compressor + + +class SignumCompressor(Compressor): + + def __init__(self, momentum): + super().__init__(average=False) + self.momentum = momentum + 
self.momentums = {} + + def compress(self, tensor, name): + """Encoding and compressing the signs """ + shape = tensor.size() + tensor = tensor.flatten() + + # update tensor by momentum + if name in self.momentums: + tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] + self.momentums[name] = tensor + tensor_compressed = tensor >= 0 + return [tensor_compressed.type(torch.uint8)], shape + + def decompress(self, tensors, shape): + sign_encode, = tensors + """Decoding the signs to float format """ + sign_decode = sign_encode.type(torch.float32) * 2 - 1 + tensor_decompressed = sign_decode.view(shape) + return tensor_decompressed + + def aggregate(self, tensors): + """Aggregate a list of tensors.""" + agged_tensor = sum(tensors) + agged_tensor = agged_tensor >= 0 + agged_tensor = agged_tensor * 2.0 - 1.0 + return agged_tensor diff --git a/untitled folder/grace_dl/torch/compressor/terngrad.py b/untitled folder/grace_dl/torch/compressor/terngrad.py new file mode 100644 index 0000000..cfe159c --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/terngrad.py @@ -0,0 +1,32 @@ +import torch + +from grace_dl.torch import Compressor + + +class TernGradCompressor(Compressor): + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + + std = (tensor - torch.mean(tensor)) ** 2 + std = torch.sqrt(torch.mean(std)) + c = 2.5 * std.item() + gradient = torch.clamp(tensor, -c, c) + abs_gradient = gradient.abs() + scalar = abs_gradient.max() + + sign_gradient = gradient.sign() * scalar + rnd_sample = torch.empty_like(tensor).uniform_(0, scalar.item()) + sign_gradient[rnd_sample >= abs_gradient] = 0 + new_sign = sign_gradient.sign() # -1, 0, 1 + + tensor_compressed = new_sign.type(torch.int8), scalar.flatten() + + return tensor_compressed, shape + + def decompress(self, tensor_compressed, shape): + tensor_compressed, scalar = tensor_compressed + sign = tensor_compressed.type(torch.float32) + tensor_decompressed = sign * 
scalar + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/threshold.py b/untitled folder/grace_dl/torch/compressor/threshold.py new file mode 100644 index 0000000..c142d53 --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/threshold.py @@ -0,0 +1,27 @@ +import torch + +from grace_dl.torch import Compressor + + +class ThresholdCompressor(Compressor): + + def __init__(self, threshold): + super().__init__(tensors_size_are_same=False) + self.threshold = threshold + + def compress(self, tensor, name): + shape = tensor.size() + tensor = tensor.flatten() + numel = tensor.numel() + + indices, = torch.where(tensor.abs() > self.threshold) + values = tensor[indices] + ctx = shape, numel + return [values, indices], ctx + + def decompress(self, tensor_compressed, ctx): + shape, numel = ctx + values, indices = tensor_compressed + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/topk.py b/untitled folder/grace_dl/torch/compressor/topk.py new file mode 100644 index 0000000..9ccfc2d --- /dev/null +++ b/untitled folder/grace_dl/torch/compressor/topk.py @@ -0,0 +1,37 @@ +import torch + +from grace.grace_dl.torch import Compressor + + +def sparsify(tensor, compress_ratio): + tensor = tensor.flatten() + k = max(1, int(tensor.numel() * compress_ratio)) + _, indices = torch.topk(tensor.abs(), k, sorted=False,) + values = torch.gather(tensor, 0, indices) + return values, indices + + +def desparsify(tensors, numel): + values, indices = tensors + # print("values, indices",values, indices,numel) + tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) + tensor_decompressed.scatter_(0, indices, values) + return tensor_decompressed + + +class TopKCompressor(Compressor): + + def 
__init__(self, compress_ratio): + super().__init__()#tensors_size_are_same=False + self.compress_ratio = compress_ratio + + def compress(self, tensor, name): + tensors = sparsify(tensor, self.compress_ratio) + ctx = tensor.numel(), tensor.size() + return tensors, ctx + + def decompress(self, tensors, ctx): + """Decompress by filling empty slots with zeros and reshape back using the original shape""" + numel, shape = ctx + tensor_decompressed = desparsify(tensors, numel) + return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/helper.py b/untitled folder/grace_dl/torch/helper.py new file mode 100644 index 0000000..0796893 --- /dev/null +++ b/untitled folder/grace_dl/torch/helper.py @@ -0,0 +1,96 @@ +def grace_from_params(params,processset): + import sys + sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") + sys.path.append("/content/grace/") + import horovod.torch as hvd + world_size = processset.size() + # print("world_size",world_size,processset.size()) + comp = params.get('compressor', 'none') + mem = params.get('memory', 'none') + comm = params.get('communicator', 'allreduce') + if comp == 'dgc': + from grace_dl.torch.compressor.dgc import DgcCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = DgcCompressor(compress_ratio) + elif comp == 'efsignsgd': + from grace_dl.torch.compressor.efsignsgd import EFSignSGDCompressor + lr = params.get('lr', 0.1) + compressor = EFSignSGDCompressor(lr) + elif comp == 'fp16': + from grace_dl.torch.compressor.fp16 import FP16Compressor + compressor = FP16Compressor() + elif comp == 'natural': + from grace_dl.torch.compressor.natural import NaturalCompressor + compressor = NaturalCompressor() + elif comp == 'none': + from grace_dl.torch.compressor.none import NoneCompressor + compressor = NoneCompressor() + elif comp == 'onebit': + from grace_dl.torch.compressor.onebit import OneBitCompressor + compressor = OneBitCompressor() + elif comp == 
'powersgd': + from grace_dl.torch.compressor.powersgd import PowerSGDCompressor + compressor = PowerSGDCompressor() + elif comp == 'qsgd': + from grace_dl.torch.compressor.qsgd import QSGDCompressor + quantum_num = params.get('quantum_num', 127) + compressor = QSGDCompressor(quantum_num) + elif comp == 'randomk': + from grace_dl.torch.compressor.randomk import RandomKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = RandomKCompressor(compress_ratio) + elif comp == 'signsgd': + from grace_dl.torch.compressor.signsgd import SignSGDCompressor + compressor = SignSGDCompressor() + elif comp == 'signum': + from grace_dl.torch.compressor.signum import SignumCompressor + momentum = params.get('momentum', 0.9) + compressor = SignumCompressor(momentum) + elif comp == 'terngrad': + from grace_dl.torch.compressor.terngrad import TernGradCompressor + compressor = TernGradCompressor() + elif comp == 'threshold': + from grace_dl.torch.compressor.threshold import ThresholdCompressor + threshold = params.get('threshold', 0.01) + compressor = ThresholdCompressor(threshold) + elif comp == 'topk': + from grace_dl.torch.compressor.topk import TopKCompressor + compress_ratio = params.get('compress_ratio', 0.3) + compressor = TopKCompressor(compress_ratio) + else: + raise NotImplementedError(comp) + + if mem == 'dgc': + from grace_dl.torch.memory.dgc import DgcMemory + momentum = params.get('momentum', 0.9) + gradient_clipping = params.get('gradient_clipping', False) + memory = DgcMemory(momentum, gradient_clipping) + elif mem == 'none': + from grace_dl.torch.memory.none import NoneMemory + memory = NoneMemory() + elif mem == 'powersgd': + from grace_dl.torch.memory.powersgd import PowerSGDMemory + compress_rank = params.get('compress_rank', 1) + memory = PowerSGDMemory(compressor.q_memory, compress_rank) + elif mem == 'residual': + from grace_dl.torch.memory.residual import ResidualMemory + memory = ResidualMemory() + elif mem == 'efsignsgd': + from 
grace_dl.torch.memory.efsignsgd import EFSignSGDMemory + lr = params.get('lr', 0.1) + memory = EFSignSGDMemory(lr) + else: + raise NotImplementedError(mem) + + if comm == 'allreduce': + from grace_dl.torch.communicator.allreduce import Allreduce + return Allreduce(compressor, memory) + elif comm == 'allgather': + from grace_dl.torch.communicator.allgather import Allgather + # print("compressor",compressor) + return Allgather(compressor, memory, world_size) + elif comm == 'broadcast': + from grace_dl.torch.communicator.broadcast import Broadcast + return Broadcast(compressor, memory, world_size) + else: + raise NotImplementedError(comm) diff --git a/untitled folder/grace_dl/torch/memory/.DS_Store b/untitled folder/grace_dl/torch/memory/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0S5Z-O8O(;SS3VK`cTCpuw5icRu7cim+m717hgE3o@v^|tU?)pN$h|lB9 z?pCzbiwBW11GC>|em2=}%T9+e#@$iaVyw;>vp^9`HVoeg`cappWGrPsuGSbc$--cg z4TE?onhgJu0X(}Z8^F1xpr6i92H<%OhH;eT&E^{~l`CuORjX>%tXpp&CtmL7qs;Zw zYkazpG7jc*Ke&wglU{xER3^C}CH+(-L}4F7ZmyyvloMBuk}y-bo_1KaW%uek)9GI4 zxFOnytyx1%Tg{_(LmV7;W;5H`+TA}n?>)v(iG0>va`@RP*)TYV7nm$;E6ncW06r5# z7F=U5O=6imfM>oSMv)K$!~ij{stlOB&Z@2I25Dc!05R}u2Jn6mpoosaLZiAmU{Eap zUV4+c`Gj5s>Zr;pIhr-p{VSS;( z8Fw_&NDL4IUm3vO52TLu|Lo`fUj@-X3=jjW$pEi(y{-#;GqrVTmso2B=p85u#uXYr lQedJ=F~nji-Ud|yc7YQ>$6%omEFknpK+!-0G4Q7hyaQ+(xSXAMfhz`8(GT2E#oBs+mo0)+a|hqE<8CF}q*6d|oI#?!EVLzwqd`6B+bk6ZvoKUf_%1 z3T7Q|L@xzfWB?gJ2ELyGbMCOY?@xspkO5@ik1-(62M#LHG8k)AM+Y>j1OUu{Sqb!I zEg@sLLCavQ5k^3$4h7Vq)RY)hhl8CPKg(dOQHK+1iVtcsQ&XW(l^y2i%AHV4Beuu@ zGBC+NT0iFH{eSRz{XZFmPsji=@UIx4nVMaz!j{b4I<+}@*K*JcP$|+c*7z0z9d#9h fue^$vL6v}=%MPGrFxChb5d0BPG+=`a{3rvj5fp6x literal 0 HcmV?d00001 diff --git a/untitled folder/patch_files/horovod/torch/.DS_Store b/untitled 
folder/patch_files/horovod/torch/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 input, output +# We keep input in order to make sure it does not get garbage collected +# before the operation is finished. +_handle_map = {} + + +def _check_function(function_factory, tensor): + function = function_factory(tensor) + if not hasattr(mpi_lib, function): + raise ValueError('Tensor type %s is not supported.' % tensor.type()) + if not tensor.is_contiguous(): + raise ValueError('Tensor is required to be contiguous.') + return function + + +def _allreduce_function_factory(tensor): + return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') + + +def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): + # Set the divisor for reduced gradients to average when necessary + if op == Average: + if rocm_built(): + # For ROCm, perform averaging at framework level + divisor = process_set.size() + op = Sum + else: + divisor = 1 + + elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") + if tensor.device.type != 'cpu' and gpu_available('torch'): + if nccl_built(): + if not is_homogeneous(): + raise NotImplementedError('Running GPU Adasum on heterogeneous cluster is not supported yet.') + elif not num_rank_is_power_2(int(size() / local_size())): + raise NotImplementedError('Running GPU Adasum with non-power of 2 nodes is not supported yet.') + if rocm_built(): + # For ROCm, perform averaging at framework level + divisor = local_size() + else: + divisor = 1 + else: + warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. 
Tensors are ' + 'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod ' + 'with HOROVOD_GPU_OPERATIONS=NCCL.') + divisor = 1 + else: + if not num_rank_is_power_2(size()): + raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') + divisor = 1 + else: + divisor = 1 + + function = _check_function(_allreduce_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)(tensor, output, divisor, + name.encode() if name is not None else _NULL, op, + prescale_factor, postscale_factor, process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def allreduce_async(tensor, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs asynchronous averaging or summation of the input tensor + over all the Horovod processes. The input tensor is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
+ + Returns: + A handle to the allreduce operation that can be used with `poll()` or + `synchronize()`. + """ + op = handle_average_backwards_compatibility(op, average) + output = tensor.new(tensor.shape) + return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set) + + +class HorovodAllreduce(torch.autograd.Function): + """An autograd function that performs allreduce on a tensor.""" + + @staticmethod + def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor, process_set): + ctx.average = average + ctx.op = op + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + ctx.process_set = process_set + handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + return allreduce(grad_output, average=ctx.average, op=ctx.op, + prescale_factor=ctx.prescale_factor, + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set), None, None, None, None, None, None + + +def allreduce(tensor, average=None, name=None, compression=Compression.none, op=None, + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): + """ + A function that performs averaging or summation of the input tensor over all the + Horovod processes. The input tensor is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + This acts as a thin wrapper around an autograd function. If your input + tensor requires gradients, then callings this function will allow gradients + to be computed and backpropagated. + + Arguments: + tensor: A tensor to reduce. + average: + .. warning:: .. 
deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A name of the reduction operation. + compression: Compression algorithm used during allreduce to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A tensor of the same shape and type as `tensor`, averaged or summed across all + processes. + """ + tensor_compressed, ctx = compression.compress(tensor) + summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name, op, + prescale_factor, postscale_factor, + process_set) + return compression.decompress(summed_tensor_compressed, ctx) + + +def allreduce_async_(tensor, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs asynchronous in-place averaging or summation of the input + tensor over all the Horovod processes. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. 
+ Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the allreduce operation that can be used with `poll()` or + `synchronize()`. + """ + op = handle_average_backwards_compatibility(op, average) + return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor, process_set) + + +def allreduce_(tensor, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs in-place averaging or summation of the input tensor over + all the Horovod processes. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A tensor of the same shape and type as `tensor`, averaged or summed across all + processes. 
+ """ + handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor, process_set) + return synchronize(handle) + + +def _grouped_allreduce_function_factory(tensor): + return 'horovod_torch_grouped_allreduce_async_' + tensor.type().replace('.', '_') + + +def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): + # Set the divisor for reduced gradients to average when necessary + if op == Average: + if rocm_built(): + # For ROCm, perform averaging at framework level + divisor = process_set.size() + op = Sum + else: + divisor = 1 + elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") + if tensors[0].device.type != 'cpu' and gpu_available('torch'): + if nccl_built(): + if not is_homogeneous(): + raise NotImplementedError('Running GPU Adasum on heterogeneous cluster is not supported yet.') + elif not num_rank_is_power_2(int(size() / local_size())): + raise NotImplementedError('Running GPU Adasum with non-power of 2 nodes is not supported yet.') + if rocm_built(): + # For ROCm, perform averaging at framework level + divisor = local_size() + else: + divisor = 1 + else: + warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors are ' + 'copied to CPU memory instead. 
To use Adasum for GPU reduction, please compile Horovod ' + 'with HOROVOD_GPU_OPERATIONS=NCCL.') + divisor = 1 + else: + if not num_rank_is_power_2(size()): + raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') + divisor = 1 + else: + divisor = 1 + + function = _check_function(_grouped_allreduce_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)(tensors, outputs, divisor, + name.encode() if name is not None else _NULL, op, + prescale_factor, postscale_factor, process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_allreduce_async(tensors, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs asynchronous averaging or summation of the input tensor + list over all the Horovod processes. The input tensors are not modified. + + The reduction operations are keyed by the base name. If a base name is not + provided, an incremented auto-generated base name is used. Reductions are + performed across tensors in the same list position. The tensor type and + shape must be the same on all Horovod processes for tensors sharing + positions in the input tensor list. The reduction will not start until all + processes are ready to send and receive the tensors. + + Arguments: + tensors: A list of tensors to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A base name to use for the group reduction operation. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. 
+ process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the group allreduce operation that can be used with `poll()` or + `synchronize()`. + """ + op = handle_average_backwards_compatibility(op, average) + outputs = [t.new(t.shape) for t in tensors] + return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set) + + +class HorovodGroupedAllreduce(torch.autograd.Function): + """An autograd function that performs allreduce on a list of tensors.""" + + @staticmethod + def forward(ctx, average, name, op, prescale_factor, postscale_factor, process_set: ProcessSet, *tensors): + ctx.average = average + ctx.op = op + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + ctx.process_set = process_set + handle = grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor, + process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + grad_reduced = grouped_allreduce(list(grad_output), average=ctx.average, op=ctx.op, + prescale_factor=ctx.prescale_factor, + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set) + return (None, None, None, None, None, None, *grad_reduced) + + +def grouped_allreduce(tensors, average=None, name=None, compression=Compression.none, op=None, + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): + """ + A function that performs averaging or summation of the input tensor + list over all the Horovod processes. The input tensors are not modified. + + The reduction operations are keyed by the base name. If a base name is not + provided, an incremented auto-generated base name is used. Reductions are + performed across tensors in the same list position. The tensor type and + shape must be the same on all Horovod processes for tensors sharing + positions in the input tensor list. 
The reduction will not start until all + processes are ready to send and receive the tensors. + + This acts as a thin wrapper around an autograd function. If your input + tensors require gradients, then calling this function will allow gradients + to be computed and backpropagated. + + Arguments: + tensors: A list of tensors to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A base name to use for the group reduction operation. + compression: Compression algorithm used during allreduce to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A list containing tensors of the same shape and type as in `tensors`, + averaged or summed across all processes. + """ + tensors_compressed, ctxs = zip(*[compression.compress(t) for t in tensors]) + summed_tensors_compressed = HorovodGroupedAllreduce.apply(average, name, op, + prescale_factor, postscale_factor, + process_set, *tensors_compressed) + return [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] + + +def grouped_allreduce_async_(tensors, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs asynchronous in-place averaging or summation of the input + tensors over all the Horovod processes. + + The reduction operations are keyed by the base name. 
If a base name is not + provided, an incremented auto-generated base name is used. Reductions are + performed across tensors in the same list position. The tensor type and + shape must be the same on all Horovod processes for tensors sharing + positions in the input tensor list. The reduction will not start until all + processes are ready to send and receive the tensors. + + Arguments: + tensors: A list of tensors to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A base name to use for the group reduction operation. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the group allreduce operation that can be used with `poll()` or + `synchronize()`. + """ + op = handle_average_backwards_compatibility(op, average) + return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor, process_set) + + +def grouped_allreduce_(tensors, average=None, name=None, op=None, + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): + """ + A function that performs in-place averaging or summation of the input tensors over + all the Horovod processes. + + The reduction operations are keyed by the base name. If a base name is not + provided, an incremented auto-generated base name is used. Reductions are + performed across tensors in the same list position. The tensor type and + shape must be the same on all Horovod processes for tensors sharing + positions in the input tensor list. 
The reduction will not start until all + processes are ready to send and receive the tensors. + + Arguments: + tensors: A list of tensors to reduce. + average: + .. warning:: .. deprecated:: 0.19.0 + + Use `op` instead. Will be removed in v1.0. + + name: A base name to use for the group reduction operation. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. + prescale_factor: Multiplicative factor to scale tensor before allreduce. + postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A list containing tensors of the same shape and type as in `tensors`, + averaged or summed across all processes. + """ + handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor, process_set) + return synchronize(handle) + + +def sparse_allreduce_async(tensor, name, op, process_set=global_process_set): + # Allgather aggregates along the first dimension, so we need to transpose the + # indices to enforce correct concatenation behavior, then transpose back prior to + # constructing the new aggregated sparse gradient + t = tensor + indices_handle = allgather_async(t._indices().transpose(0, 1).contiguous(), name=f'{name}.indices', + process_set=process_set) + values_handle = allgather_async(t._values(), name=f'{name}.values', process_set=process_set) + + def handle(): + # We need to sync values handle firstly for torch nightly >= 10.0 + # Issue: https://github.com/horovod/horovod/issues/2961 + values = synchronize(values_handle) + indices = synchronize(indices_handle) + + values = (values / process_set.size()) if op == Average else values + + if indices.dim() == 0 or values.dim() == 0: + return t.new().resize_as_(t) + return t.new(indices.transpose(0, 1), values, 
t.size()) + + return handle + + +def _allgather_function_factory(tensor): + return 'horovod_torch_allgather_async_' + tensor.type().replace('.', '_') + + +def _allgather_async(tensor, output, name, process_set: ProcessSet): + function = _check_function(_allgather_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)( + tensor, output, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def allgather_async(tensor, name=None, process_set=global_process_set): + """ + A function that asynchronously concatenates the input tensor with the same input + tensor on all other Horovod processes. The input tensor is not modified. + + The concatenation is done on the first dimension, so the input tensors on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + tensor: A tensor to allgather. + name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the allgather operation that can be used with `poll()` or + `synchronize()`. 
+ """ + output = tensor.new() + return _allgather_async(tensor, output, name, process_set) + + +class HorovodAllgather(torch.autograd.Function): + """An autograd function that performs allgather on a tensor.""" + + @staticmethod + def forward(ctx, tensor, name, process_set: ProcessSet): + ctx.dim = tensor.shape[0] + ctx.process_set = process_set + handle = allgather_async(tensor, name, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#average CHANGE + + dim_t = torch.IntTensor([ctx.dim]) + dim = allgather(dim_t, process_set=ctx.process_set).view(ctx.process_set.size()) + + r = ctx.process_set.rank() + offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 + return grad_reduced.narrow(0, offset, ctx.dim), None, None + + +def allgather(tensor, name=None, process_set=global_process_set): + """ + A function that concatenates the input tensor with the same input tensor on + all other Horovod processes. The input tensor is not modified. + + The concatenation is done on the first dimension, so the corresponding + input tensors on the different processes must have the same rank and + shape, except for the first dimension, which is allowed to be different. + + This acts as a thin wrapper around an autograd function. If your input + tensor requires gradients, then callings this function will allow gradients + to be computed and backpropagated. + + Arguments: + tensor: A tensor to allgather. + name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A tensor of the same type as `tensor`, concatenated on dimension zero + across all processes. 
The shape is identical to the input shape, except for + the first dimension, which may be greater and is the sum of all first + dimensions of the tensors in different Horovod processes. + """ + return HorovodAllgather.apply(tensor, name, process_set) + + +def _grouped_allgather_function_factory(tensor): + return 'horovod_torch_grouped_allgather_async_' + tensor.type().replace('.', '_') + + +def _grouped_allgather_async(tensors, outputs, name, process_set: ProcessSet): + function = _check_function(_grouped_allgather_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)( + tensors, outputs, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_allgather_async(tensors, name=None, process_set=global_process_set): + """ + A function that asynchronously concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. The input + tensors are not modified. + + The concatenation is done on the first dimension, so the input tensors on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the group allgather operation that can be used with `poll()` or + `synchronize()`. 
+ """ + outputs = [t.new() for t in tensors] + return _grouped_allgather_async(tensors, outputs, name, process_set) + + +class HorovodGroupedAllgather(torch.autograd.Function): + """An autograd function that performs allgather on a list of tensors.""" + + @staticmethod + def forward(ctx, name, process_set: ProcessSet, *tensors): + ctx.dims = [t.shape[0] for t in tensors] + ctx.process_set = process_set + handle = grouped_allgather_async(list(tensors), name, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + grad_reduced = grouped_allreduce(list(grad_output), average=True, process_set=ctx.process_set) + + dim_ts = [torch.IntTensor([dim]) for dim in ctx.dims] + dims = [dt.view(ctx.process_set.size()) + for dt in grouped_allgather(dim_ts, process_set=ctx.process_set)] + + r = ctx.process_set.rank() + offsets = [torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 + for dim in dims] + result = [gr.narrow(0, offset, dim) + for gr, offset, dim in zip(grad_reduced, offsets, ctx.dims)] + return (None, None, *result) + + +def grouped_allgather(tensors, name=None, process_set=global_process_set): + """ + A function that concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. + The input tensors are not modified. + + The concatenation is done on the first dimension, so the corresponding input + tensors on the different processes must have the same rank and shape, except + for the first dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A list containing tensors of the same type as in `tensors`. Each tensor + is concatenated on dimension zero across all processes. 
Its shape is + identical to the corresponding input shape, expect for the first + dimension, which may be greater and is the sum of all first dimensions + of the corresponding tensor in different Horovod processes. + """ + return HorovodGroupedAllgather.apply(name, process_set, *tensors) + + +def _broadcast_function_factory(tensor): + return 'horovod_torch_broadcast_async_' + tensor.type().replace('.', '_') + + +def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet): + function = _check_function(_broadcast_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)( + tensor, output, root_rank, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set): + """ + A function that asynchronously broadcasts the input tensor on root rank to the same + input tensor on all other Horovod processes. The input tensor is not modified. + + The broadcast operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The broadcast will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to broadcast. + root_rank: The rank to broadcast the value from. + name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the broadcast operation that can be used with `poll()` or + `synchronize()`. 
+ """ + output = tensor.new(tensor.shape) + return _broadcast_async(tensor, output, root_rank, name, process_set) + + +class HorovodBroadcast(torch.autograd.Function): + """An autograd function that broadcasts a tensor.""" + + @staticmethod + def forward(ctx, tensor, root_rank, name, process_set: ProcessSet): + ctx.root_rank = root_rank + ctx.process_set = process_set + handle = broadcast_async(tensor, root_rank, name, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#True CHANGED + if rank() != ctx.root_rank: + grad_reduced *= 0 + return grad_reduced, None, None, None + + +def broadcast(tensor, root_rank, name=None, process_set=global_process_set): + """ + A function that broadcasts the input tensor on root rank to the same input tensor + on all other Horovod processes. The input tensor is not modified. + + The broadcast operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The broadcast will not start until all processes + are ready to send and receive the tensor. + + This acts as a thin wrapper around an autograd function. If your input + tensor requires gradients, then callings this function will allow gradients + to be computed and backpropagated. + + Arguments: + tensor: A tensor to broadcast. + root_rank: The rank to broadcast the value from. + name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A tensor of the same shape and type as `tensor`, with the value broadcasted + from root rank. 
+ """ + return HorovodBroadcast.apply(tensor, root_rank, name, process_set) + + +def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_set): + """ + A function that asynchronously broadcasts the input tensor on root rank to the same + input tensor on all other Horovod processes. The operation is performed in-place. + + The broadcast operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The broadcast will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to broadcast. + root_rank: The rank to broadcast the value from. + name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the broadcast operation that can be used with `poll()` or + `synchronize()`. + """ + return _broadcast_async(tensor, tensor, root_rank, name, process_set) + + +def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): + """ + A function that broadcasts the input tensor on root rank to the same input tensor + on all other Horovod processes. The operation is performed in-place. + + The broadcast operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The broadcast will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to broadcast. + root_rank: The rank to broadcast the value from. + name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
+ + Returns: + A tensor of the same shape and type as `tensor`, with the value broadcasted + from root rank. + """ + handle = broadcast_async_(tensor, root_rank, name, process_set) + return synchronize(handle) + +def _alltoall_function_factory(tensor): + return 'horovod_torch_alltoall_async_' + tensor.type().replace('.', '_') + +def _alltoall_async(tensor, splits, output, output_received_splits, name, process_set: ProcessSet): + if splits is None: + # If splits not provided, create empty tensor as placeholder + splits = torch.tensor([], dtype=torch.int32, device='cpu') + elif not isinstance(splits, torch.Tensor): + splits = torch.tensor(splits, dtype=torch.int32, device='cpu') + function = _check_function(_alltoall_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)( + tensor, splits, output, output_received_splits, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, splits, (output, output_received_splits)) + return handle + + +def alltoall_async(tensor, splits=None, name=None, process_set=global_process_set): + """ + A function that scatters slices of the input tensor to all other Horovod processes + and returns a tensor of gathered slices from all other Horovod processes. The input + tensor is not modified. + + The slicing is done on the first dimension, so the input tensors on + the different processes must have the same rank and shape, except for the + first dimension, which is allowed to be different. + + Arguments: + tensor: A tensor to distribute with alltoall. + splits: A tensor of integers in rank order describing how many + elements in `tensor` to send to each worker. Splitting is + applied along the first dimension of `tensor`. If `splits` is + not provided, the first dimension is split equally by the + number of Horovod processes. + name: A name of the alltoall operation. 
+ process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the alltoall operation that can be used with `poll()` or + `synchronize()`. + """ + output = tensor.new() + if isinstance(splits, torch.Tensor): + output_received_splits = splits.new() + else: + output_received_splits = torch.empty(size(), dtype=torch.int32, device='cpu') + return _alltoall_async(tensor, splits, output, output_received_splits, name, process_set) + + +class HorovodAlltoall(torch.autograd.Function): + """An autograd function that performs alltoall on a tensor.""" + + @staticmethod + def forward(ctx, tensor, splits, name, process_set: ProcessSet): + handle = alltoall_async(tensor, splits, name, process_set) + output, received_splits = synchronize(handle) + + ctx.process_set = process_set + ctx.recvsplits = received_splits + if splits is None: + return output + else: + ctx.mark_non_differentiable(received_splits) + return output, received_splits + + @staticmethod + def backward(ctx, grad_output, *dead_gradients): + grad_wrt_tensor, _ = alltoall(grad_output, splits=ctx.recvsplits, + process_set=ctx.process_set) + return grad_wrt_tensor, None, None, None + + +def alltoall(tensor, splits=None, name=None, process_set=global_process_set): + """ + A function that scatters slices of the input tensor to all other Horovod processes + and returns a tensor of gathered slices from all other Horovod processes. The input + tensor is not modified. + + The slicing is done on the first dimension, so the input tensors on + the different processes must have the same rank and shape, except for the + first dimension, which is allowed to be different. + + This acts as a thin wrapper around an autograd function. If your input + tensor requires gradients, then callings this function will allow gradients + to be computed and backpropagated. + + Arguments: + tensor: A tensor to distribute with alltoall. 
+ splits: A tensor of integers in rank order describing how many + elements in `tensor` to send to each worker. Splitting is + applied along the first dimension of `tensor`. If `splits` is + not provided, the first dimension is split equally by the + number of Horovod processes. + name: A name of the alltoall operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + 1) A tensor containing the gathered tensor data from all workers. + 2) If `splits` has been provided: A tensor of integers in rank order + describing how many elements in the output tensor have been received + from each worker. + """ + return HorovodAlltoall.apply(tensor, splits, name, process_set) + + +def _reducescatter_function_factory(tensor): + return 'horovod_torch_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _reducescatter_async(tensor, output, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_reducescatter_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)(tensor, output, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def reducescatter_async(tensor, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs asynchronous reduction of the input tensor over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensor is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. 
The reduction will not start until all processes + are ready to send and receive the tensor. + + The input tensors on the different processes must have the same rank and shape. The + output tensor will be the same rank on all processes, but the first dimension may + be different. + + Arguments: + tensor: A tensor to average and sum. + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A handle to the reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + output = tensor.new() + return _reducescatter_async(tensor, output, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a tensor.""" + + @staticmethod + def forward(ctx, tensor, name, op, process_set, prescale_factor, postscale_factor): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = reducescatter_async(tensor, name, op, process_set, prescale_factor, postscale_factor) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + if ctx.op == Sum: + grad_output *= ctx.process_set.size() + if ctx.prescale_factor != 1.0: + grad_output *= ctx.prescale_factor + if ctx.postscale_factor != 1.0: + grad_output *= ctx.postscale_factor + + return allgather(grad_output, process_set=ctx.process_set), None, None, None, None, None + + +def reducescatter(tensor, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): 
+ """ + A function that performs reduction of the input tensor over all the Horovod + processes, then scatters the results across all Horovod processes. The input tensor + is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to average/sum and scatter. + name: A name of the reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A tensor of the same rank and type as `tensor` across all processes. The shape + is identical to the input shape, except for the first dimension, which will be + divided across the different Horovod processes. 
+ """ + tensor_compressed, ctx = compression.compress(tensor) + reduced_tensor_compressed = HorovodReducescatter.apply(tensor_compressed, name, op, process_set, + prescale_factor, postscale_factor) + return compression.decompress(reduced_tensor_compressed, ctx) + + +def _grouped_reducescatter_function_factory(tensor): + return 'horovod_torch_grouped_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _grouped_reducescatter_async(tensors, outputs, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_grouped_reducescatter_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)(tensors, outputs, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_reducescatter_async(tensors, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs asynchronous reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. For each of the input tensors, the type and shape must + be the same on all Horovod processes for a given name. The reduction will not start + until all processes are ready to send and receive the tensors. + + The input tensor at some place in the list tensor must have the same rank and shape + on the different processes. The corresponding output tensor will be the same rank on + all processes, but the first dimension may be different. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. 
+ op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + Returns: + A handle to the group reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + outputs = [t.new() for t in tensors] + return _grouped_reducescatter_async(tensors, outputs, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodGroupedReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a list of tensors.""" + + @staticmethod + def forward(ctx, name, op, process_set, prescale_factor, postscale_factor, *tensors): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = grouped_reducescatter_async(list(tensors), name, op, process_set, + prescale_factor, postscale_factor, ) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + if ctx.op == Sum: + grad_output = [g * ctx.process_set.size() for g in grad_output] + if ctx.prescale_factor != 1.0: + grad_output = [ctx.prescale_factor * g for g in grad_output] + if ctx.postscale_factor != 1.0: + grad_output = [ctx.postscale_factor * g for g in grad_output] + + return (None, None, None, None, None, + *grouped_allgather(grad_output, process_set=ctx.process_set)) + + +def grouped_reducescatter(tensors, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. 
+ + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + + Returns: + A list containing tensors of the same rank and type as in `tensors`. For each + tensor the shape is identical to the input shape, except for the first dimension, + which will be divided across the different Horovod processes. + """ + tensors_compressed = [] + ctxs = [] + for tensor in tensors: + tensor_compressed, ctx = compression.compress(tensor) + tensors_compressed.append(tensor_compressed) + ctxs.append(ctx) + reduced_tensors_compressed = HorovodGroupedReducescatter.apply(name, op, process_set, + prescale_factor, postscale_factor, + *tensors_compressed) + return [compression.decompress(reduced_tensor_compressed, ctx) + for reduced_tensor_compressed, ctx in zip(reduced_tensors_compressed, ctxs)] + + +def poll(handle): + """ + Polls an allreduce, allgather, alltoall, broadcast, or reducescatter handle to determine whether + underlying asynchronous operation has completed. After `poll()` returns `True`, + `synchronize()` will return without blocking. 
+ + Arguments: + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. + + Returns: + A flag indicating whether the operation has completed. + """ + return mpi_lib.horovod_torch_poll(handle) != 0 + + +def synchronize(handle): + """ + Synchronizes an asynchronous allreduce, allgather, alltoall, broadcast, or reducescatter operation + until it's completed. Returns the result of the operation. + + Arguments: + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. + + Returns: + A single output tensor of the operation or a tuple of multiple output tensors. + """ + if handle not in _handle_map: + return + + try: + mpi_lib.horovod_torch_wait_and_clear(handle) + output = _handle_map.pop(handle)[-1] + return output + except RuntimeError as e: + _handle_map.pop(handle, None) + raise HorovodInternalError(e) + + +def join(device=-1) -> int: + """A function that indicates that the rank finished processing data. + + All ranks that did not call join() continue to process allreduce operations. + This function blocks Python thread until all ranks join. + + Arguments: + device: An id of the device to create temprorary zero tensors (default -1, CPU) + + Returns: + Id of the rank that joined last. + """ + output = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu")) + try: + handle = mpi_lib.horovod_torch_join(output, device) + except RuntimeError as e: + raise HorovodInternalError(e) + + _handle_map[handle] = (None, output) + + return synchronize(handle).item() + +def barrier(process_set=global_process_set): + """ + A function that acts as a simple sychronization point for ranks specified + in the given process group(default to global group). Ranks that reach + this function call will stall until all other ranks have reached. + + Arguments: + process_set: Process set object to limit this operation to a subset of + Horovod processes. 
Default is the global process set. + """ + + try: + handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + + _handle_map[handle] = (None, None) + + synchronize(handle) diff --git a/untitled folder/patch_files/horovod/torch/optimizer.py b/untitled folder/patch_files/horovod/torch/optimizer.py new file mode 100644 index 0000000..7e3f3b1 --- /dev/null +++ b/untitled folder/patch_files/horovod/torch/optimizer.py @@ -0,0 +1,618 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# Modifications copyright Microsoft +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import warnings + +from contextlib import contextmanager + +import torch + +from horovod.common.util import split_list + +from horovod.torch.compression import Compression +from horovod.torch.functions import broadcast_object +from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_, sparse_allreduce_async +from horovod.torch.mpi_ops import synchronize +from horovod.torch.mpi_ops import size +from horovod.torch.mpi_ops import Average, Adasum, Sum +from horovod.torch.mpi_ops import rocm_built +from horovod.torch.mpi_ops import ProcessSet, global_process_set + + +class _DistributedOptimizer(torch.optim.Optimizer): + def __init__(self, params, named_parameters,grace, compression, + backward_passes_per_step=1, op=Average, + gradient_predivide_factor=1.0, + groups=None, + sparse_as_dense=False, + process_set=global_process_set): + super(self.__class__, self).__init__(params) + self._grace = grace + self._process_set=process_set + self._compression = compression + + if named_parameters is not None: + named_parameters = list(named_parameters) + else: + named_parameters = [(f'allreduce.noname.{i}.{j}', v) + for i, param_group in enumerate(self.param_groups) + for j, v in enumerate(param_group['params'])] + # make sure that named_parameters are tuples + if any([not isinstance(p, tuple) for p in named_parameters]): + raise ValueError('named_parameters should be a sequence of ' + 'tuples (name, parameter), usually produced by ' + 'model.named_parameters().') + + dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters]) + if len(dups) > 0: + raise ValueError('Parameter names in named_parameters must be unique. 
' + 'Found duplicates: %s' % ', '.join(dups)) + + all_param_ids = {id(v) + for param_group in self.param_groups + for v in param_group['params']} + named_param_ids = {id(v) for k, v in named_parameters} + unnamed_param_ids = all_param_ids - named_param_ids + if len(unnamed_param_ids): + raise ValueError('named_parameters was specified, but one or more model ' + 'parameters were not named. Python object ids: ' + '%s' % ', '.join(str(id) for id in unnamed_param_ids)) + + self._parameter_names = {v: k for k, v in sorted(named_parameters)} + self.backward_passes_per_step = backward_passes_per_step + self._allreduce_delay = {v: self.backward_passes_per_step + for _, v in sorted(named_parameters)} + self.op = op + self.gradient_predivide_factor = gradient_predivide_factor + self.sparse_as_dense = sparse_as_dense + self.process_set = process_set + + self._handles = {} + self._grad_accs = [] + self._requires_update = set() + self._synchronized = False + self._should_synchronize = True + + if groups is not None: + if not (isinstance(groups, list) or groups > 0): + raise ValueError('groups should be a non-negative integer or ' + 'a list of list of torch.Tensor.') + if isinstance(groups, list): + grouped_parameter_ids = set() + for l in groups: + for p in l: + if not isinstance(p, torch.Tensor): + raise ValueError('groups must consist of torch.Tensor.') + if id(p) in grouped_parameter_ids: + raise ValueError('A parameter can only appear once in groups.') + grouped_parameter_ids.add(id(p)) + self._groups = groups + self._p_to_group = {} + self._group_counts = {} + + if self.process_set.included() and (size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1'): + self._register_hooks() + + def load_state_dict(self, *args, **kwargs): + self._handles = {} + self._synchronized = False + self._should_synchronize = True + for p in self._allreduce_delay: + self._allreduce_delay[p] = self.backward_passes_per_step + super(self.__class__, self).load_state_dict(*args, **kwargs) + + 
@staticmethod + def find_duplicates(lst): + seen = set() + dups = set() + for el in lst: + if el in seen: + dups.add(el) + seen.add(el) + return dups + + def set_backward_passes_per_step(self, passes): + self.backward_passes_per_step = passes + for p in self._allreduce_delay: + self._allreduce_delay[p] = self.backward_passes_per_step + + def _register_hooks(self): + if self._groups is not None: + p_list = [] + # Get list of parameters with grads + for param_group in self.param_groups: + for p in param_group['params']: + if p.requires_grad: + p_list.append(p) + + # To ensure parameter order and group formation is consistent, broadcast p_list order + # from rank 0 and use for every worker + p_list_names = [self._parameter_names.get(p) for p in p_list] + p_list_names = broadcast_object(p_list_names, root_rank=0, process_set=self.process_set) + p_list = sorted(p_list, key=lambda p: p_list_names.index(self._parameter_names.get(p))) + + # Form groups + if isinstance(self._groups, list): + p_groups = [] + grouped_id = set() + p_list_ids = [id(p) for p in p_list] + for group in self._groups: + p_groups.append([p for p in group if id(p) in p_list_ids]) + for p in p_groups[-1]: + grouped_id.add(id(p)) + for p in p_list: + if id(p) not in grouped_id: + p_groups.append([p]) + else: + p_groups = split_list(p_list, self._groups) + + p_groups = [tuple(p) for p in p_groups] + for group in p_groups: + for p in group: + self._p_to_group[p] = group + self._group_counts[group] = 0 + + for param_group in self.param_groups: + for p in param_group['params']: + if p.requires_grad: + p.grad = p.data.new(p.size()).zero_() + self._requires_update.add(p) + p_tmp = p.expand_as(p) + grad_acc = p_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_hook(p)) + self._grad_accs.append(grad_acc) + + def _allreduce_grad_async(self, p): + if p.grad is None: + # Gradient was not computed, but we still need to submit a tensor to allreduce + # as one of the other ranks may have computed 
it (due to dynamic forward functions). + # + # NOTE: this will not work if the gradient is sparse and we perform an allgather. + # Unfrotunately, there doesn't appear to be a good way to detect that the parameter will + # produce sparse gradients before computing the gradient. + p.grad = p.data.new(p.size()).zero_() + + name = self._parameter_names.get(p) + tensor = p.grad + + if p.grad.is_sparse: + if self.sparse_as_dense: + tensor = tensor.to_dense() + else: + return self._sparse_allreduce_grad_async(p, name) + if self._grace and self._groups is None and self.op == Average: + handle, ctx = self._grace.send_step(tensor, name,self._process_set) + postscale_factor = self.gradient_predivide_factor + else: + + tensor_compressed, ctx = self._compression.compress(tensor) + + if self.op == Average: + # Split average operation across pre/postscale factors + # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. + prescale_factor = 1.0 / self.gradient_predivide_factor + postscale_factor = self.gradient_predivide_factor + else: + prescale_factor = 1.0 + postscale_factor = 1.0 + + handle = allreduce_async_(tensor_compressed, name=name, op=self.op, + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + process_set=self.process_set) + return handle, ctx + + def _grouped_allreduce_grad_async(self, ps): + name = self._parameter_names.get(ps[0]) + tensors_compressed, ctxs = zip(*[self._compression.compress(p.grad) for p in ps]) + + handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op, + process_set=self.process_set) + return handle, ctxs + + def _sparse_allreduce_grad_async(self, p, name): + handle = sparse_allreduce_async(p.grad, name=name, op=self.op, + process_set=self.process_set) + return handle, None + + def _make_hook(self, p): + def hook(*ignore): + if p in self._handles and self._handles[p][0] is not None: + if self._allreduce_delay[p] <= 0: + raise AssertionError( + "Gradients were computed 
more than " + "backward_passes_per_step times before call " + "to step(). Increase backward_passes_per_step to " + "accumulate gradients locally.") + assert not p.grad.requires_grad + assert self._allreduce_delay[p] > 0 + handle, ctx = None, None + self._allreduce_delay[p] -= 1 + if self._allreduce_delay[p] == 0: + if self._groups is not None: + group = self._p_to_group[p] + self._group_counts[group] += 1 + if self._group_counts[group] == len(group): + handle, ctxs = self._grouped_allreduce_grad_async(group) + self._handles[group] = (handle, ctxs) + # Remove any None entries from previous no-op hook calls + for gp in group: + self._handles.pop(gp, None) + self._group_counts[group] = 0 + return + else: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + return hook + + def synchronize(self): + if not self.process_set.included(): + self._synchronized = True + return + + completed = set() + for x in self._handles.keys(): + completed.update(x) if isinstance(x, tuple) else completed.add(x) + missing_p = self._requires_update - completed + for p in missing_p: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + + for p, (handle, ctx) in self._handles.items(): + if handle is None: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + for p, (handle, ctx) in self._handles.items(): + + if isinstance(p, tuple): + # This was a grouped result, need to unpack + outputs = synchronize(handle) + for gp, output, gctx in zip(p, outputs, ctx): + self._allreduce_delay[gp] = self.backward_passes_per_step + gp.grad.set_(self._compression.decompress(output, gctx)) + if self._groups is not None and self._group_counts[p] != 0: + self._group_counts[p] = 0 + else: + # When handle is a callable function, it returns the aggregated tensor result + if self._grace and self._groups is None and self.op == Average: + output = self._grace.receive_step(handle, ctx,self._process_set) + p.grad.set_(output) + else: 
+ output = synchronize(handle) if not callable(handle) else handle() + self._allreduce_delay[p] = self.backward_passes_per_step + if self._groups is not None: + group = self._p_to_group[p] + if self._group_counts[group] != 0: + self._group_counts[group] = 0 + if p.grad.is_sparse: + aggregated = self._compression.decompress(output, ctx) + if not aggregated.is_sparse: + # When sparse_as_dense=True we need to convert the grad back to sparse before update + aggregated = aggregated.to_sparse() + + # Sparse grads do not support set_ for some reason, so we do this as an equivalent + p.grad.zero_().add_(aggregated) + p.grad.set_(self._compression.decompress(output, ctx)) + self._handles.clear() + + self._synchronized = True + + @contextmanager + def skip_synchronize(self): + """ + A context manager used to specify that optimizer.step() should + not perform synchronization. + + It's typically used in a following pattern: + + .. code-block:: python + + optimizer.synchronize() + with optimizer.skip_synchronize(): + optimizer.step() + """ + self._should_synchronize = False + try: + yield + finally: + self._should_synchronize = True + + def step(self, closure=None): + if self._should_synchronize: + if self._synchronized: + warnings.warn("optimizer.step() called without " + "optimizer.skip_synchronize() context after " + "optimizer.synchronize(). This can cause training " + "slowdown. You may want to consider using " + "optimizer.skip_synchronize() context if you use " + "optimizer.synchronize() in your code.") + self.synchronize() + self._synchronized = False + return super(self.__class__, self).step(closure) + + def zero_grad(self): + if self._handles: + raise AssertionError("optimizer.zero_grad() was called after loss.backward() " + "but before optimizer.step() or optimizer.synchronize(). 
" + "This is prohibited as it can cause a race condition.") + return super(self.__class__, self).zero_grad() + + +class _DistributedAdasumOptimizer(torch.optim.Optimizer): + def __init__(self, params, named_parameters, compression, + backward_passes_per_step=1): + super(self.__class__, self).__init__(params) + + self._compression = compression + + if named_parameters is not None: + named_parameters = list(named_parameters) + else: + named_parameters = [('allreduce.noname.%s' % i, v) + for param_group in self.param_groups + for i, v in enumerate(param_group['params'])] + + # make sure that named_parameters are tuples + if any([not isinstance(p, tuple) for p in named_parameters]): + raise ValueError('named_parameters should be a sequence of ' + 'tuples (name, parameter), usually produced by ' + 'model.named_parameters().') + + dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters]) + if len(dups) > 0: + raise ValueError('Parameter names in named_parameters must be unique. ' + 'Found duplicates: %s' % ', '.join(dups)) + + all_param_ids = {id(v) + for param_group in self.param_groups + for v in param_group['params']} + named_param_ids = {id(v) for k, v in named_parameters} + unnamed_param_ids = all_param_ids - named_param_ids + if len(unnamed_param_ids): + raise ValueError('named_parameters was specified, but one or more model ' + 'parameters were not named. 
Python object ids: ' + '%s' % ', '.join(str(id) for id in unnamed_param_ids)) + + self._parameter_names = {v: k for k, v in sorted(named_parameters)} + self.backward_passes_per_step = backward_passes_per_step + self._allreduce_delay = {v: self.backward_passes_per_step + for _, v in sorted(named_parameters)} + self._handles = {} + self._grad_accs = [] + self._requires_update = set() + self._synchronized = False + self._should_synchronize = True + + self._starting_models = { + p : torch.zeros_like(p, requires_grad=False) + for _, p in named_parameters + } + + self._register_hooks() + + def set_backward_passes_per_step(self, passes): + self.backward_passes_per_step = passes + for p in self._allreduce_delay: + self._allreduce_delay[p] = self.backward_passes_per_step + + def _register_hooks(self): + for param_group in self.param_groups: + for p in param_group['params']: + if p.requires_grad: + p.grad = p.data.new(p.size()).zero_() + self._requires_update.add(p) + p_tmp = p.expand_as(p) + grad_acc = p_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_hook(p)) + self._grad_accs.append(grad_acc) + + def _allreduce_grad_async(self, p): + # Delta optimizer implements this logic: + # start = current.copy() + # step() -> computes 'current - \alpha.f(g)' where f is + # optimizer logic and g is the gradient + # delta = current-start + # allreduce_(delta) + # start += delta + # current = start + # In order to suppport this logic using function hook to improve performance, + # we do: + # delta = (start - \alpha.f(g)) - start + # = -\alpha.f(g) + # set start to zero and step computes -\alpha.f(g) + # where f is the underlying optimizer logic + + name = self._parameter_names.get(p) + start = self._starting_models[p] + + stashed_params = [] + for group in self.param_groups: + stashed_params.append(group['params']) + # only want to step on p + if any([p is v for v in group['params']]): + group['params'] = [p] + else: + group['params'] = [] + + start.data.copy_(p) + 
+ super(self.__class__, self).step() + + # compute delta = curr - start + p.data.sub_(start) + + # allreduce as before + tensor_compressed, ctx = self._compression.compress(p) + handle = allreduce_async_(tensor_compressed.data, name=name, op=Adasum) + + # reset stashed parameters + for stashed, group in zip(stashed_params, self.param_groups): + group['params'] = stashed + + return handle, ctx + + def _make_hook(self, p): + def hook(*ignore): + if p in self._handles and self._handles[p][0] is not None: + if self._allreduce_delay[p] <= 0: + raise AssertionError( + "Gradients were computed more than " + "backward_passes_per_step times before call " + "to step(). Increase backward_passes_per_step to " + "accumulate gradients locally.") + assert not p.grad.requires_grad + assert self._allreduce_delay[p] > 0 + handle, ctx = None, None + self._allreduce_delay[p] -= 1 + if self._allreduce_delay[p] == 0: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + return hook + + def synchronize(self): + pass + + @contextmanager + def skip_synchronize(self): + raise AssertionError("Skipping synchronization is not supported when using Adasum optimizer.") + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + missing_p = self._requires_update - set(self._handles.keys()) + for p in missing_p: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + + for p, (handle, ctx) in self._handles.items(): + # This means step() is called before backward_passes_per_steps finished. + # We do a synchoronous allreduce here. 
+ if not handle: + handle, ctx = self._allreduce_grad_async(p) + self._handles[p] = (handle, ctx) + delta = synchronize(handle) + delta = self._compression.decompress(delta, ctx) + start = self._starting_models[p] + start.data.add_(delta.data) + p.data.copy_(start) + self._allreduce_delay[p] = self.backward_passes_per_step + self._handles.clear() + return loss + + def zero_grad(self): + if self._handles: + raise AssertionError("optimizer.zero_grad() was called after loss.backward() " + "but before optimizer.step() or optimizer.synchronize(). " + "This is prohibited as it can cause a race condition.") + return super(self.__class__, self).zero_grad() + + +def DistributedOptimizer(optimizer, named_parameters=None,grace=None, + compression=Compression.none, + backward_passes_per_step=1, + op=Average, + gradient_predivide_factor=1.0, + num_groups=0, groups=None, + sparse_as_dense=False, + process_set=global_process_set): + """ + An optimizer that wraps another torch.optim.Optimizer, using an allreduce to + combine gradient values before applying gradients to model weights. + + Allreduce operations are executed after each gradient is computed by ``loss.backward()`` + in parallel with each other. The ``step()`` method ensures that all allreduce operations are + finished before applying gradients to the model. + + DistributedOptimizer exposes the ``synchronize()`` method, which forces allreduce operations + to finish before continuing the execution. It's useful in conjunction with gradient + clipping, or other operations that modify gradients in place before ``step()`` is executed. + Make sure to use ``optimizer.skip_synchronize()`` if you're calling ``synchronize()`` + in your code. + + Example of gradient clipping: + + .. 
code-block:: python + + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.synchronize() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + with optimizer.skip_synchronize(): + optimizer.step() + + Arguments: + optimizer: Optimizer to use for computing gradients and applying updates. + named_parameters: A mapping between parameter names and values. Used for naming of + allreduce operations. Typically just ``model.named_parameters()``. + compression: Compression algorithm used during allreduce to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + backward_passes_per_step: Number of expected backward passes to perform + before calling step()/synchronize(). This + allows accumulating gradients over multiple + mini-batches before reducing and applying them. + op: The reduction operation to use when combining gradients across different ranks. + gradient_predivide_factor: If op == Average, gradient_predivide_factor splits the averaging + before and after the sum. Gradients are scaled by + 1.0 / gradient_predivide_factor before the sum and + gradient_predivide_factor / size after the sum. + num_groups: Number of groups to assign gradient allreduce ops to for explicit + grouping. Defaults to no explicit groups. + groups: The parameter to group the gradient allreduce ops. Accept values is a + non-negative integer or a list of list of torch.Tensor. + If groups is a non-negative integer, it is the number of groups to assign + gradient allreduce ops to for explicit grouping. + If groups is a list of list of torch.Tensor. Tensors in the same + inner list will be assigned to the same group, while parameter that does + not appear in any list will form a group itself. + Defaults as None, which is no explicit groups. + sparse_as_dense: If set True, convert all sparse gradients to dense and perform allreduce, then + convert back to sparse before applying the update. 
+ process_set: Gradients will only be reduced over Horovod processes belonging + to this process set. Defaults to the global process set. + """ + # We dynamically create a new class that inherits from the optimizer that was passed in. + # The goal is to override the `step()` method with an allreduce implementation. + if gradient_predivide_factor != 1.0: + if rocm_built(): + raise ValueError('gradient_predivide_factor not supported yet with ROCm') + if op != Average: + raise ValueError('gradient_predivide_factor not supported with op != Average') + + if num_groups != 0: + warnings.warn('Parameter `num_groups` has been replaced by `groups` ' + 'and will be removed in v0.23.0.', DeprecationWarning) + if groups is None: + groups = num_groups + + if backward_passes_per_step <= 0: + raise ValueError("backward_passes_per_step must be > 0") + + if op != Adasum or size() == 1: + cls = type(optimizer.__class__.__name__, (optimizer.__class__,), + dict(_DistributedOptimizer.__dict__)) + return cls(optimizer.param_groups, named_parameters,grace, compression, backward_passes_per_step, op, + gradient_predivide_factor, groups, sparse_as_dense, process_set) + else: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") + cls = type(optimizer.__class__.__name__, (optimizer.__class__,), + dict(_DistributedAdasumOptimizer.__dict__)) + return cls(optimizer.param_groups, named_parameters, compression, backward_passes_per_step) From eceab88abfa3425f07522e52ac55ece42d5accaf Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 00:54:12 -0400 Subject: [PATCH 09/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes grace_dl/.DS_Store | Bin 6148 -> 0 bytes grace_dl/dist/.DS_Store | Bin 6148 -> 0 bytes grace_dl/tensorflow/.DS_Store | Bin 8196 -> 0 bytes grace_dl/tensorflow/communicator/.DS_Store | Bin 6148 -> 0 bytes grace_dl/tensorflow/compressor/.DS_Store | Bin 6148 -> 0 bytes 
grace_dl/tensorflow/memory/.DS_Store | Bin 6148 -> 0 bytes grace_dl/torch/.DS_Store | Bin 8196 -> 0 bytes grace_dl/torch/communicator/.DS_Store | Bin 6148 -> 0 bytes grace_dl/torch/compressor/.DS_Store | Bin 6148 -> 0 bytes grace_dl/torch/memory/.DS_Store | Bin 6148 -> 0 bytes patch_files/.DS_Store | Bin 6148 -> 0 bytes patch_files/horovod/.DS_Store | Bin 6148 -> 0 bytes patch_files/horovod/torch/.DS_Store | Bin 6148 -> 0 bytes untitled folder/.DS_Store | Bin 6148 -> 0 bytes untitled folder/grace_dl/.DS_Store | Bin 6148 -> 0 bytes untitled folder/grace_dl/__init__.py | 1 - untitled folder/grace_dl/dist/.DS_Store | Bin 6148 -> 0 bytes untitled folder/grace_dl/dist/__init__.py | 51 - .../grace_dl/dist/communicator/all_to_all.py | 124 - .../grace_dl/dist/communicator/allgather.py | 45 - .../grace_dl/dist/communicator/allreduce.py | 13 - .../grace_dl/dist/communicator/broadcast.py | 33 - .../dist/compressor/cnat_cuda/cnat.cpp | 29 - .../dist/compressor/cnat_cuda/cnat_cuda.cu | 184 -- .../dist/compressor/cnat_cuda/setup.py | 14 - .../grace_dl/dist/compressor/dgc.py | 50 - .../grace_dl/dist/compressor/efsignsgd.py | 33 - .../grace_dl/dist/compressor/fp16.py | 22 - .../grace_dl/dist/compressor/natural.py | 57 - .../grace_dl/dist/compressor/none.py | 12 - .../grace_dl/dist/compressor/onebit.py | 31 - .../grace_dl/dist/compressor/powersgd.py | 65 - .../grace_dl/dist/compressor/qsgd.py | 79 - .../dist/compressor/qsgd_cuda/example.py | 16 - .../dist/compressor/qsgd_cuda/qsgd.cpp | 24 - .../dist/compressor/qsgd_cuda/qsgd_cuda.cu | 538 ----- .../dist/compressor/qsgd_cuda/setup.py | 14 - .../cub/agent/agent_histogram.cuh | 787 ------ .../cub/agent/agent_radix_sort_downsweep.cuh | 772 ------ .../cub/agent/agent_radix_sort_upsweep.cuh | 526 ---- .../radixtopk_cuda/cub/agent/agent_reduce.cuh | 385 --- .../cub/agent/agent_reduce_by_key.cuh | 549 ----- .../radixtopk_cuda/cub/agent/agent_rle.cuh | 837 ------- .../radixtopk_cuda/cub/agent/agent_scan.cuh | 471 ---- 
.../cub/agent/agent_segment_fixup.cuh | 375 --- .../cub/agent/agent_select_if.cuh | 703 ------ .../cub/agent/agent_spmv_orig.cuh | 925 ------- .../cub/agent/single_pass_scan_operators.cuh | 815 ------- .../cub/block/block_adjacent_difference.cuh | 596 ----- .../cub/block/block_discontinuity.cuh | 1148 --------- .../cub/block/block_exchange.cuh | 1248 ---------- .../cub/block/block_histogram.cuh | 415 ---- .../radixtopk_cuda/cub/block/block_load.cuh | 1268 ---------- .../cub/block/block_radix_rank.cuh | 697 ------ .../cub/block/block_radix_sort.cuh | 862 ------- .../cub/block/block_raking_layout.cuh | 152 -- .../radixtopk_cuda/cub/block/block_reduce.cuh | 607 ----- .../radixtopk_cuda/cub/block/block_scan.cuh | 2126 ----------------- .../cub/block/block_shuffle.cuh | 305 --- .../radixtopk_cuda/cub/block/block_store.cuh | 1000 -------- .../block_histogram_atomic.cuh | 82 - .../specializations/block_histogram_sort.cuh | 226 -- .../specializations/block_reduce_raking.cuh | 222 -- .../block_reduce_raking_commutative_only.cuh | 199 -- .../block_reduce_warp_reductions.cuh | 222 -- .../specializations/block_scan_raking.cuh | 666 ------ .../specializations/block_scan_warp_scans.cuh | 392 --- .../block_scan_warp_scans2.cuh | 436 ---- .../block_scan_warp_scans3.cuh | 418 ---- .../compressor/radixtopk_cuda/cub/cub.cuh | 95 - .../cub/device/device_histogram.cuh | 866 ------- .../cub/device/device_partition.cuh | 273 --- .../cub/device/device_radix_sort.cuh | 796 ------ .../cub/device/device_reduce.cuh | 734 ------ .../cub/device/device_run_length_encode.cuh | 278 --- .../radixtopk_cuda/cub/device/device_scan.cuh | 443 ---- .../device/device_segmented_radix_sort.cuh | 875 ------- .../cub/device/device_segmented_reduce.cuh | 619 ----- .../cub/device/device_select.cuh | 369 --- .../radixtopk_cuda/cub/device/device_spmv.cuh | 174 -- .../device/dispatch/dispatch_histogram.cuh | 1096 --------- .../device/dispatch/dispatch_radix_sort.cuh | 1657 ------------- 
.../cub/device/dispatch/dispatch_reduce.cuh | 882 ------- .../dispatch/dispatch_reduce_by_key.cuh | 554 ----- .../cub/device/dispatch/dispatch_rle.cuh | 538 ----- .../cub/device/dispatch/dispatch_scan.cuh | 563 ----- .../device/dispatch/dispatch_select_if.cuh | 542 ----- .../device/dispatch/dispatch_spmv_orig.cuh | 850 ------- .../radixtopk_cuda/cub/grid/grid_barrier.cuh | 211 -- .../cub/grid/grid_even_share.cuh | 222 -- .../radixtopk_cuda/cub/grid/grid_mapping.cuh | 113 - .../radixtopk_cuda/cub/grid/grid_queue.cuh | 220 -- .../radixtopk_cuda/cub/host/mutex.cuh | 171 -- .../cub/iterator/arg_index_input_iterator.cuh | 259 -- .../cache_modified_input_iterator.cuh | 240 -- .../cache_modified_output_iterator.cuh | 254 -- .../cub/iterator/constant_input_iterator.cuh | 235 -- .../cub/iterator/counting_input_iterator.cuh | 228 -- .../cub/iterator/discard_output_iterator.cuh | 220 -- .../cub/iterator/tex_obj_input_iterator.cuh | 310 --- .../cub/iterator/tex_ref_input_iterator.cuh | 374 --- .../cub/iterator/transform_input_iterator.cuh | 252 -- .../radixtopk_cuda/cub/thread/thread_load.cuh | 438 ---- .../cub/thread/thread_operators.cuh | 317 --- .../cub/thread/thread_reduce.cuh | 152 -- .../radixtopk_cuda/cub/thread/thread_scan.cuh | 268 --- .../cub/thread/thread_search.cuh | 154 -- .../cub/thread/thread_store.cuh | 422 ---- .../radixtopk_cuda/cub/util_allocator.cuh | 708 ------ .../radixtopk_cuda/cub/util_arch.cuh | 151 -- .../radixtopk_cuda/cub/util_debug.cuh | 145 -- .../radixtopk_cuda/cub/util_device.cuh | 347 --- .../radixtopk_cuda/cub/util_macro.cuh | 103 - .../radixtopk_cuda/cub/util_namespace.cuh | 46 - .../radixtopk_cuda/cub/util_ptx.cuh | 729 ------ .../radixtopk_cuda/cub/util_type.cuh | 1141 --------- .../warp/specializations/warp_reduce_shfl.cuh | 551 ----- .../warp/specializations/warp_reduce_smem.cuh | 375 --- .../warp/specializations/warp_scan_shfl.cuh | 656 ----- .../warp/specializations/warp_scan_smem.cuh | 397 --- 
.../radixtopk_cuda/cub/warp/warp_reduce.cuh | 612 ----- .../radixtopk_cuda/cub/warp/warp_scan.cuh | 936 -------- .../dist/compressor/radixtopk_cuda/example.py | 31 - .../compressor/radixtopk_cuda/rdxtopk.cpp | 18 - .../compressor/radixtopk_cuda/rdxtopk_cuda.cu | 582 ----- .../dist/compressor/radixtopk_cuda/setup.py | 14 - .../grace_dl/dist/compressor/randomk.py | 41 - .../grace_dl/dist/compressor/signsgd.py | 30 - .../grace_dl/dist/compressor/signum.py | 37 - .../grace_dl/dist/compressor/terngrad.py | 32 - .../grace_dl/dist/compressor/threshold.py | 27 - .../grace_dl/dist/compressor/topk.py | 69 - untitled folder/grace_dl/dist/helper.py | 102 - untitled folder/grace_dl/dist/memory/dgc.py | 39 - .../grace_dl/dist/memory/efsignsgd.py | 19 - untitled folder/grace_dl/dist/memory/none.py | 11 - .../grace_dl/dist/memory/powersgd.py | 37 - .../grace_dl/dist/memory/residual.py | 20 - untitled folder/grace_dl/tensorflow/.DS_Store | Bin 8196 -> 0 bytes .../grace_dl/tensorflow/__init__.py | 114 - .../tensorflow/communicator/.DS_Store | Bin 6148 -> 0 bytes .../tensorflow/communicator/allgather.py | 46 - .../tensorflow/communicator/allreduce.py | 20 - .../tensorflow/communicator/broadcast.py | 19 - .../grace_dl/tensorflow/compressor/.DS_Store | Bin 6148 -> 0 bytes .../grace_dl/tensorflow/compressor/adaq.py | 93 - .../grace_dl/tensorflow/compressor/dgc.py | 68 - .../tensorflow/compressor/efsignsgd.py | 40 - .../grace_dl/tensorflow/compressor/fp16.py | 25 - .../tensorflow/compressor/inceptionn.py | 188 -- .../grace_dl/tensorflow/compressor/natural.py | 42 - .../grace_dl/tensorflow/compressor/none.py | 14 - .../grace_dl/tensorflow/compressor/onebit.py | 42 - .../grace_dl/tensorflow/compressor/packing.py | 30 - .../tensorflow/compressor/powersgd.py | 58 - .../grace_dl/tensorflow/compressor/qsgd.py | 44 - .../grace_dl/tensorflow/compressor/randomk.py | 53 - .../grace_dl/tensorflow/compressor/signsgd.py | 30 - .../grace_dl/tensorflow/compressor/signum.py | 45 - 
.../grace_dl/tensorflow/compressor/sketch.py | 39 - .../tensorflow/compressor/terngrad.py | 42 - .../tensorflow/compressor/threshold.py | 33 - .../grace_dl/tensorflow/compressor/topk.py | 49 - .../grace_dl/tensorflow/compressor/u8bit.py | 110 - untitled folder/grace_dl/tensorflow/helper.py | 107 - .../grace_dl/tensorflow/memory/.DS_Store | Bin 6148 -> 0 bytes .../grace_dl/tensorflow/memory/dgc.py | 38 - .../grace_dl/tensorflow/memory/efsignsgd.py | 24 - .../grace_dl/tensorflow/memory/none.py | 11 - .../grace_dl/tensorflow/memory/powersgd.py | 38 - .../grace_dl/tensorflow/memory/residual.py | 28 - untitled folder/grace_dl/torch/.DS_Store | Bin 8196 -> 0 bytes untitled folder/grace_dl/torch/__init__.py | 58 - .../grace_dl/torch/communicator/.DS_Store | Bin 6148 -> 0 bytes .../grace_dl/torch/communicator/allgather.py | 58 - .../grace_dl/torch/communicator/allreduce.py | 15 - .../grace_dl/torch/communicator/broadcast.py | 27 - .../grace_dl/torch/compressor/.DS_Store | Bin 6148 -> 0 bytes .../grace_dl/torch/compressor/dgc.py | 50 - .../grace_dl/torch/compressor/efsignsgd.py | 33 - .../grace_dl/torch/compressor/fp16.py | 22 - .../grace_dl/torch/compressor/natural.py | 40 - .../grace_dl/torch/compressor/none.py | 12 - .../grace_dl/torch/compressor/onebit.py | 32 - .../grace_dl/torch/compressor/powersgd.py | 58 - .../grace_dl/torch/compressor/qsgd.py | 39 - .../grace_dl/torch/compressor/randomk.py | 40 - .../grace_dl/torch/compressor/signsgd.py | 30 - .../grace_dl/torch/compressor/signum.py | 37 - .../grace_dl/torch/compressor/terngrad.py | 32 - .../grace_dl/torch/compressor/threshold.py | 27 - .../grace_dl/torch/compressor/topk.py | 37 - untitled folder/grace_dl/torch/helper.py | 96 - .../grace_dl/torch/memory/.DS_Store | Bin 6148 -> 0 bytes untitled folder/grace_dl/torch/memory/dgc.py | 38 - .../grace_dl/torch/memory/efsignsgd.py | 19 - untitled folder/grace_dl/torch/memory/none.py | 11 - .../grace_dl/torch/memory/powersgd.py | 37 - .../grace_dl/torch/memory/residual.py 
| 23 - untitled folder/patch_files/.DS_Store | Bin 6148 -> 0 bytes untitled folder/patch_files/horovod/.DS_Store | Bin 6148 -> 0 bytes .../patch_files/horovod/torch/.DS_Store | Bin 6148 -> 0 bytes .../patch_files/horovod/torch/__init__.py | 61 - .../patch_files/horovod/torch/mpi_ops.py | 1333 ----------- .../patch_files/horovod/torch/optimizer.py | 618 ----- 206 files changed, 51510 deletions(-) delete mode 100644 grace_dl/.DS_Store delete mode 100644 grace_dl/dist/.DS_Store delete mode 100644 grace_dl/tensorflow/.DS_Store delete mode 100644 grace_dl/tensorflow/communicator/.DS_Store delete mode 100644 grace_dl/tensorflow/compressor/.DS_Store delete mode 100644 grace_dl/tensorflow/memory/.DS_Store delete mode 100644 grace_dl/torch/.DS_Store delete mode 100644 grace_dl/torch/communicator/.DS_Store delete mode 100644 grace_dl/torch/compressor/.DS_Store delete mode 100644 grace_dl/torch/memory/.DS_Store delete mode 100644 patch_files/.DS_Store delete mode 100644 patch_files/horovod/.DS_Store delete mode 100644 patch_files/horovod/torch/.DS_Store delete mode 100644 untitled folder/.DS_Store delete mode 100644 untitled folder/grace_dl/.DS_Store delete mode 100644 untitled folder/grace_dl/__init__.py delete mode 100644 untitled folder/grace_dl/dist/.DS_Store delete mode 100644 untitled folder/grace_dl/dist/__init__.py delete mode 100644 untitled folder/grace_dl/dist/communicator/all_to_all.py delete mode 100644 untitled folder/grace_dl/dist/communicator/allgather.py delete mode 100644 untitled folder/grace_dl/dist/communicator/allreduce.py delete mode 100644 untitled folder/grace_dl/dist/communicator/broadcast.py delete mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp delete mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu delete mode 100644 untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py delete mode 100644 untitled folder/grace_dl/dist/compressor/dgc.py delete mode 100644 untitled 
folder/grace_dl/dist/compressor/efsignsgd.py delete mode 100644 untitled folder/grace_dl/dist/compressor/fp16.py delete mode 100644 untitled folder/grace_dl/dist/compressor/natural.py delete mode 100644 untitled folder/grace_dl/dist/compressor/none.py delete mode 100644 untitled folder/grace_dl/dist/compressor/onebit.py delete mode 100644 untitled folder/grace_dl/dist/compressor/powersgd.py delete mode 100644 untitled folder/grace_dl/dist/compressor/qsgd.py delete mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py delete mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp delete mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu delete mode 100644 untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh delete mode 100644 untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh delete mode 100644 untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh 
delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh delete mode 100644 untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh delete mode 100644 untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu delete mode 100644 untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py delete mode 100644 untitled folder/grace_dl/dist/compressor/randomk.py delete mode 100644 untitled folder/grace_dl/dist/compressor/signsgd.py delete mode 100644 untitled folder/grace_dl/dist/compressor/signum.py delete mode 100644 untitled folder/grace_dl/dist/compressor/terngrad.py delete mode 100644 untitled folder/grace_dl/dist/compressor/threshold.py delete mode 100644 untitled folder/grace_dl/dist/compressor/topk.py delete mode 100644 untitled folder/grace_dl/dist/helper.py delete mode 100644 untitled folder/grace_dl/dist/memory/dgc.py delete mode 100644 untitled folder/grace_dl/dist/memory/efsignsgd.py delete mode 100644 untitled folder/grace_dl/dist/memory/none.py delete mode 100644 untitled folder/grace_dl/dist/memory/powersgd.py delete mode 100644 untitled folder/grace_dl/dist/memory/residual.py delete mode 100644 untitled folder/grace_dl/tensorflow/.DS_Store delete mode 100644 untitled folder/grace_dl/tensorflow/__init__.py delete mode 100644 untitled folder/grace_dl/tensorflow/communicator/.DS_Store delete mode 100644 untitled folder/grace_dl/tensorflow/communicator/allgather.py delete mode 100644 untitled folder/grace_dl/tensorflow/communicator/allreduce.py delete mode 100644 untitled folder/grace_dl/tensorflow/communicator/broadcast.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/.DS_Store delete mode 
100644 untitled folder/grace_dl/tensorflow/compressor/adaq.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/dgc.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/fp16.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/inceptionn.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/natural.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/none.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/onebit.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/packing.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/powersgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/qsgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/randomk.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/signsgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/signum.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/sketch.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/terngrad.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/threshold.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/topk.py delete mode 100644 untitled folder/grace_dl/tensorflow/compressor/u8bit.py delete mode 100644 untitled folder/grace_dl/tensorflow/helper.py delete mode 100644 untitled folder/grace_dl/tensorflow/memory/.DS_Store delete mode 100644 untitled folder/grace_dl/tensorflow/memory/dgc.py delete mode 100644 untitled folder/grace_dl/tensorflow/memory/efsignsgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/memory/none.py delete mode 100644 untitled folder/grace_dl/tensorflow/memory/powersgd.py delete mode 100644 untitled folder/grace_dl/tensorflow/memory/residual.py delete mode 100644 untitled folder/grace_dl/torch/.DS_Store 
delete mode 100644 untitled folder/grace_dl/torch/__init__.py delete mode 100644 untitled folder/grace_dl/torch/communicator/.DS_Store delete mode 100644 untitled folder/grace_dl/torch/communicator/allgather.py delete mode 100644 untitled folder/grace_dl/torch/communicator/allreduce.py delete mode 100644 untitled folder/grace_dl/torch/communicator/broadcast.py delete mode 100644 untitled folder/grace_dl/torch/compressor/.DS_Store delete mode 100644 untitled folder/grace_dl/torch/compressor/dgc.py delete mode 100644 untitled folder/grace_dl/torch/compressor/efsignsgd.py delete mode 100644 untitled folder/grace_dl/torch/compressor/fp16.py delete mode 100644 untitled folder/grace_dl/torch/compressor/natural.py delete mode 100644 untitled folder/grace_dl/torch/compressor/none.py delete mode 100644 untitled folder/grace_dl/torch/compressor/onebit.py delete mode 100644 untitled folder/grace_dl/torch/compressor/powersgd.py delete mode 100644 untitled folder/grace_dl/torch/compressor/qsgd.py delete mode 100644 untitled folder/grace_dl/torch/compressor/randomk.py delete mode 100644 untitled folder/grace_dl/torch/compressor/signsgd.py delete mode 100644 untitled folder/grace_dl/torch/compressor/signum.py delete mode 100644 untitled folder/grace_dl/torch/compressor/terngrad.py delete mode 100644 untitled folder/grace_dl/torch/compressor/threshold.py delete mode 100644 untitled folder/grace_dl/torch/compressor/topk.py delete mode 100644 untitled folder/grace_dl/torch/helper.py delete mode 100644 untitled folder/grace_dl/torch/memory/.DS_Store delete mode 100644 untitled folder/grace_dl/torch/memory/dgc.py delete mode 100644 untitled folder/grace_dl/torch/memory/efsignsgd.py delete mode 100644 untitled folder/grace_dl/torch/memory/none.py delete mode 100644 untitled folder/grace_dl/torch/memory/powersgd.py delete mode 100644 untitled folder/grace_dl/torch/memory/residual.py delete mode 100644 untitled folder/patch_files/.DS_Store delete mode 100644 untitled 
folder/patch_files/horovod/.DS_Store delete mode 100644 untitled folder/patch_files/horovod/torch/.DS_Store delete mode 100644 untitled folder/patch_files/horovod/torch/__init__.py delete mode 100644 untitled folder/patch_files/horovod/torch/mpi_ops.py delete mode 100644 untitled folder/patch_files/horovod/torch/optimizer.py diff --git a/.DS_Store b/.DS_Store index a5368b17789b4e20b560a7aeab0f7e415f4f348a..bad07e965141e67ad73cab84e3ab9c0ed6fb1380 100644 GIT binary patch delta 32 mcmZp1XmOa}&nUVvU^hRb=w==PbJmR|N0>IVOV|Jz2bln?Dhf~l delta 704 zcmZp1XmOa}&nUAoU^hRb%w`?|b5?nNhEj$+h7yKMAkJY(Wk_LAU`S)g2eMOuyrRhp z0u7V>ggojAs7onM&PmG8&tU)p0!nkz4TF>Oa|=L9p@4vn+foy`>QpnI%kb%v48u$XKEVw8yCqFM8=p@F8h1)l?OW2GCEJjkIIAFH`06_qk A8UO$Q diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store deleted file mode 100644 index f530e2364144b4e446d40dbda61eafc6e2c1a79a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK&2AGh5FV!~-LwKC0fJnRQLjNHZAw*fSwgsQLM(y4lxB|UNmkj0Y2$ z5C#Jbd3ZmJ1Ch1GG!7D(>nVp*aVp(vZ8mFdx9YsPyFIV-S+lXbRp+g}o%y`toZGx| z?N0YuG>XLw`t*1Z9v*%tRsX1diLJuTnXDHE

11-z=NRRO3VX}ndjzqkVej;HNl_U-OF#OZ{UEbNN^R27RU)9gplBt?8oYM+kUCQ%d zD#ULcB?nf^`#@-8*~O?5VwMp%bsz%uZ^GC=18$3p08Tqu-B z2UhY4fLO+1Y1pQ>1m$pzuEvEz^q?`7il|bVequ0{j^kX%xf&M=RXQ;J_+WZvre7#b zu8#9_84k==J#@zzUCXgN s!9qrH3xzTTD}5cy23^G$ut-B6rv{>{aiI`L(Ci-pNrSB{1HYAluVJbE9{>OV diff --git a/grace_dl/dist/.DS_Store b/grace_dl/dist/.DS_Store deleted file mode 100644 index 0e5427f8fe4b097ce86acb1b96e871829d3c7ce3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c^3>-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk diff --git a/grace_dl/tensorflow/.DS_Store b/grace_dl/tensorflow/.DS_Store deleted file mode 100644 index 354ac1b19a8cd736f5444b51fb428fc7b68e8ed2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM%Wl&^6ur}i#%U?aqLtbeme@ujp+Oa5W74ok7{LNiuw$#WusxRTG^(OV-qYXU z2M`;!EcgWefju9I?=FLYxb z;5jb3ynEw?4xJ9@ByT8dIvh&IG&psC z-P`~1Y_-30CI9Fozm_grQti4=%{cUFc#J-boK2~61Tnaf;BkW#uH0N+-Aad|vR^jq zSjwx~{Jc=La?F5UD*89yf|`VX!79frDY=1Hs*4r%gR`4*jTSbQSFT_vDz@_-6)V{D zO8x%2P^#o}sZgCyK^b^~(-h|^DZ=ckCiN$_P|dAl;1AJaWw!tT diff --git a/grace_dl/tensorflow/communicator/.DS_Store b/grace_dl/tensorflow/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 
zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0fgc5S?v9>(Bx@v{HMs#5IVdqD2)KH-tOF2o8XP9b2u1?X}{BsEQ)_4u6O% z2Yv!vIl?bM+?d(jrgmZn^@3=-W9>d?cV<7{i@llz09a?_wE=bjz`#P-*ubj9sGrJ$ zt*Dl5M27JIBZwh@J~%L3@YW5Q0nLDBKr^5j&lQ}vRBud0nNa*WPsie zCKf_VVJ1<$IR58IJuXxg%N%Gl?=B zm<$J|W@aiwVPbZSGle@aOQNnd1Db(F1}Jym#EhLogjYU)=P+=`Y2t?5MW0V@5(eq8 z)%wa-R%fxcvGth8Lnny+fd~jZ zA6;I*3}TOuJA4#-sfe{@hfz1`y~gfja&P~zY3?1gr%iLxZav&tgDzPE*rqGFBB`To38 z!S9(;g)t(2u27XvQJJTtOks>UMW|n!X5ex$uqv0kL;3%c>fiq_*QWGpH3OP~3o*cI zUAx=C5ISzljB;%g%UdjDh_gQo@!mvNim?=C60rsCpZ^eWs?UEx KysoY_1AhR)|7g?z diff --git a/grace_dl/torch/communicator/.DS_Store b/grace_dl/torch/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0S5Z-O8O(;SS3VK`cTCpuw5icRu7cim+m717hgE3o@v^|tU?)pN$h|lB9 z?pCzbiwBW11GC>|em2=}%T9+e#@$iaVyw;>vp^9`HVoeg`cappWGrPsuGSbc$--cg z4TE?onhgJu0X(}Z8^F1xpr6i92H<%OhH;eT&E^{~l`CuORjX>%tXpp&CtmL7qs;Zw zYkazpG7jc*Ke&wglU{xER3^C}CH+(-L}4F7ZmyyvloMBuk}y-bo_1KaW%uek)9GI4 zxFOnytyx1%Tg{_(LmV7;W;5H`+TA}n?>)v(iG0>va`@RP*)TYV7nm$;E6ncW06r5# z7F=U5O=6imfM>oSMv)K$!~ij{stlOB&Z@2I25Dc!05R}u2Jn6mpoosaLZiAmU{Eap zUV4+c`Gj5s>Zr;pIhr-p{VSS;( z8Fw_&NDL4IUm3vO52TLu|Lo`fUj@-X3=jjW$pEi(y{-#;GqrVTmso2B=p85u#uXYr lQedJ=F~nji-Ud|yc7YQ>$6%omEFknpK+!-0G4Q7hyaQ+(xSXAMfhz`8(GT2E#oBs+mo0)+a|hqE<8CF}q*6d|oI#?!EVLzwqd`6B+bk6ZvoKUf_%1 z3T7Q|L@xzfWB?gJ2ELyGbMCOY?@xspkO5@ik1-(62M#LHG8k)AM+Y>j1OUu{Sqb!I 
zEg@sLLCavQ5k^3$4h7Vq)RY)hhl8CPKg(dOQHK+1iVtcsQ&XW(l^y2i%AHV4Beuu@ zGBC+NT0iFH{eSRz{XZFmPsji=@UIx4nVMaz!j{b4I<+}@*K*JcP$|+c*7z0z9d#9h fue^$vL6v}=%MPGrFxChb5d0BPG+=`a{3rvj5fp6x diff --git a/patch_files/horovod/torch/.DS_Store b/patch_files/horovod/torch/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0VeL9Ymtu5kVh9;K3|QOV(Cl8L$leYYfP}OJN8T@L>Q;`#0bsz`2h>%Hu3=w?C8G zR{hG=ZMsdH^rbfvGcWh^Y1Z{8&*jq-A$V{x_JgN!H0wJz9*QLQ<0P6Whd7MTR2r2g`LN>+#8_7@_Q79gGZwv`VTPW5NzZTYpPj6? z3;)V49WBMYtwNW+>#a9QxJZT=M=rJrDdsB2Y>lwmzJ?e$<}H&o_cG+HVtLD%t1wx~ zANl&Jnyck3e*Y1D+tFF9c<)&J%H^M}S29|2@FS`+_FG=1W!i z7?AaWVUy?@EHtXpft|VnAl6Z>1jnl8Au_2zbPX07QG-IXE24HK=88eIJL;LSa}5?6 zwL1_M8Q)_f6LUi$Dmv4lxB|UNmkj0Y2$ z5C#Jbd3ZmJ1Ch1GG!7D(>nVp*aVp(vZ8mFdx9YsPyFIV-S+lXbRp+g}o%y`toZGx| z?N0YuG>XLw`t*1Z9v*%tRsX1diLJuTnXDHE

11-z=NRRO3VX}ndjzqkVej;HNl_U-OF#OZ{UEbNN^R27RU)9gplBt?8oYM+kUCQ%d zD#ULcB?nf^`#@-8*~O?5VwMp%bsz%uZ^GC=18$3p08Tqu-B z2UhY4fLO+1Y1pQ>1m$pzuEvEz^q?`7il|bVequ0{j^kX%xf&M=RXQ;J_+WZvre7#b zu8#9_84k==J#@zzUCXgN s!9qrH3xzTTD}5cy23^G$ut-B6rv{>{aiI`L(Ci-pNrSB{1HYAluVJbE9{>OV diff --git a/untitled folder/grace_dl/__init__.py b/untitled folder/grace_dl/__init__.py deleted file mode 100644 index ae099b3..0000000 --- a/untitled folder/grace_dl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '1.0' diff --git a/untitled folder/grace_dl/dist/.DS_Store b/untitled folder/grace_dl/dist/.DS_Store deleted file mode 100644 index 0e5427f8fe4b097ce86acb1b96e871829d3c7ce3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c^3>-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk diff --git a/untitled folder/grace_dl/dist/__init__.py b/untitled folder/grace_dl/dist/__init__.py deleted file mode 100644 index dd04733..0000000 --- a/untitled folder/grace_dl/dist/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -from abc import ABC, abstractmethod - - -class Memory(ABC): - @abstractmethod - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - raise NotImplemented("compensate was not implemented.") - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - pass - - -class Compressor(ABC): - """Interface for compressing and decompressing a given tensor.""" - - def __init__(self, average=True, tensors_size_are_same=True): - self.average = average - self.tensors_size_are_same = tensors_size_are_same - - @abstractmethod - def compress(self, tensor, name): - """Compresses a tensor and returns it with 
the context needed to decompress it.""" - raise NotImplemented("compress was not implemented.") - - @abstractmethod - def decompress(self, tensors, ctx): - """Decompress the tensor with the given context.""" - raise NotImplemented("decompress was not implemented.") - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - return sum(tensors) - - -class Communicator(ABC): - @abstractmethod - def send_receive(self, tensors, name, ctx): - raise NotImplemented("send was not implemented.") - - def __init__(self, compressor, memory, world_size): - self.compressor = compressor - self.memory = memory - self.world_size = world_size - - def step(self, tensor, name): - tensor = self.memory.compensate(tensor, name) - tensors_compressed, ctx = self.compressor.compress(tensor, name) - self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - return self.send_receive(tensors_compressed, name, ctx) diff --git a/untitled folder/grace_dl/dist/communicator/all_to_all.py b/untitled folder/grace_dl/dist/communicator/all_to_all.py deleted file mode 100644 index c40268a..0000000 --- a/untitled folder/grace_dl/dist/communicator/all_to_all.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch, math -from torch import distributed as dist - -from grace_dl.dist import Communicator - -from grace_dl.dist.compressor.qsgd import QSGDCompressor -from grace_dl.dist.compressor.qsgd import QSGDCompressor_CUDA -from grace_dl.dist.compressor.terngrad import TernGradCompressor -from grace_dl.dist.compressor.natural import NaturalCompressor -from grace_dl.dist.compressor.natural import NaturalCompressor_CUDA - - -class AllToAll(Communicator): - - def compress_allgather(self, tensors, ctx): - tensors_gathered = [] - for tensor_compressed in tensors: - tensor_list = [torch.empty_like(tensor_compressed) for _ in range(self.world_size)] - dist.all_gather(tensor_list, tensor_compressed) - tensors_gathered.append(tensor_list) - - decompressed_list = [] - for tensors_compressed in 
zip(*tensors_gathered): - tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) - decompressed_list.append(tensor_decompressed) - tensors_concat = torch.cat(decompressed_list, dim=0) - return tensors_concat - - def send_receive(self, tensors, name, ctx): - - if isinstance(self.compressor, QSGDCompressor) or isinstance(self.compressor, QSGDCompressor_CUDA): - tensor, norm = tensors - tensor_size = tensor.numel() - world_size = self.world_size - bucket_size = self.compressor.bucket_size - - pad_size = math.ceil(tensor.numel() / (world_size * bucket_size)) * ( - world_size * bucket_size) - tensor.numel() - padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) - tensor = torch.cat((tensor, padding), dim=0) - - pad_size = tensor.numel() / bucket_size - norm.numel() - padding = torch.empty(int(pad_size), dtype=norm.dtype, layout=norm.layout, device=norm.device) - norm = torch.cat((norm, padding), dim=0) - - data = tensor - input = list(data.chunk(world_size)) - output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) - dist.all_to_all(output, input) - tensor_chunk_list = output - - data = norm - input = list(data.chunk(world_size)) - output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) - dist.all_to_all(output, input) - norm_chunk_list = output - - compressed_list = [tensor_chunk_list, norm_chunk_list] - decompressed_list = [] - for tensors_compressed in zip(*compressed_list): - chunk_shape = tensors_compressed[0].size() - tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) - decompressed_list.append(tensor_decompressed) - tensors_aggregated = self.compressor.aggregate(decompressed_list) - - elif isinstance(self.compressor, TernGradCompressor): - tensor, scalar = tensors - tensor_size = tensor.numel() - world_size = self.world_size - - pad_size = math.ceil(tensor.numel() / world_size) * 
world_size - tensor.numel() - padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) - tensor = torch.cat((tensor, padding), dim=0) - - data = tensor - input = list(data.chunk(world_size)) - output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) - dist.all_to_all(output, input) - tensor_chunk_list = output - - input = [scalar for _ in range(world_size)] - output = [torch.empty_like(scalar, device=scalar.device) for _ in range(world_size)] - dist.all_to_all(output, input) - scalar_chunk_list = output - - compressed_list = [tensor_chunk_list, scalar_chunk_list] - decompressed_list = [] - for tensors_compressed in zip(*compressed_list): - chunk_shape = tensors_compressed[0].size() - tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) - decompressed_list.append(tensor_decompressed) - tensors_aggregated = self.compressor.aggregate(decompressed_list) - - elif isinstance(self.compressor, NaturalCompressor) or isinstance(self.compressor, NaturalCompressor_CUDA): - tensor = tensors[0] - tensor_size = tensor.numel() - world_size = self.world_size - - pad_size = math.ceil(tensor.numel() / world_size) * world_size - tensor.numel() - padding = torch.empty(int(pad_size), dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) - tensor = torch.cat((tensor, padding), dim=0) - - data = tensor - input = list(data.chunk(world_size)) - output = list(torch.empty([data.numel()], dtype=data.dtype, device=data.device).chunk(world_size)) - dist.all_to_all(output, input) - tensor_chunk_list = output - - compressed_list = [[t, ] for t in tensor_chunk_list] - decompressed_list = [] - for tensors_compressed in compressed_list: - chunk_shape = tensors_compressed[0].size() - tensor_decompressed = self.compressor.decompress(tensors_compressed, chunk_shape) - decompressed_list.append(tensor_decompressed) - tensors_aggregated = self.compressor.aggregate(decompressed_list) 
- - else: - raise NotImplementedError(self.compressor) - - tensors_compressed, ctx2 = self.compressor.compress(tensors_aggregated, name) - tensors_concat = self.compress_allgather(tensors_compressed, ctx2) - tensors_aggregated = tensors_concat[:tensor_size].view(ctx) - - return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/dist/communicator/allgather.py b/untitled folder/grace_dl/dist/communicator/allgather.py deleted file mode 100644 index a86f911..0000000 --- a/untitled folder/grace_dl/dist/communicator/allgather.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from torch import distributed as dist - -from grace_dl.dist import Communicator - - -class Allgather(Communicator): - def send_receive(self, tensors, name, ctx): - if self.compressor.tensors_size_are_same: - tensors_gathered = [] - for tensor_compressed in tensors: - tensor_list = [torch.empty_like(tensor_compressed) for _ in range(self.world_size)] - dist.all_gather(tensor_list, tensor_compressed) - tensors_gathered.append(tensor_list) - else: - local_sizes = torch.tensor([t.numel() for t in tensors]).cuda(tensors[0].device.index) - gathered_sizes = [torch.empty_like(local_sizes) for _ in range(self.world_size)] - dist.all_gather(gathered_sizes, local_sizes) # tensor of tensor sizes per rank - - tensors_gathered = [] - for tensor, sizes in zip(tensors, zip(*gathered_sizes)): - local_size = tensor.numel() - max_size = max(sizes) - gathered = [] - for _ in range(self.world_size): - padded = torch.empty(max_size, dtype=tensor.dtype, layout=tensor.layout, device=tensor.device) - gathered.append(padded) - if local_size != max_size: - padding = torch.empty(max_size - local_size, dtype=tensor.dtype, layout=tensor.layout, - device=tensor.device) - tensor = torch.cat((tensor, padding), dim=0) - dist.all_gather(gathered, tensor) - - data_list = [] - for size, tensor_gathered in zip(sizes, gathered): - 
data_list.append(tensor_gathered[:size]) - - tensors_gathered.append(data_list) - - decompressed_list = [] - for tensors_compressed in zip(*tensors_gathered): - tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) - decompressed_list.append(tensor_decompressed) - tensors_aggregated = self.compressor.aggregate(decompressed_list) - return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/dist/communicator/allreduce.py b/untitled folder/grace_dl/dist/communicator/allreduce.py deleted file mode 100644 index 4d8a3d2..0000000 --- a/untitled folder/grace_dl/dist/communicator/allreduce.py +++ /dev/null @@ -1,13 +0,0 @@ -from torch import distributed as dist - -from grace_dl.dist import Communicator - - -class Allreduce(Communicator): - - def send_receive(self, tensors, name, ctx): - for tensor_compressed in tensors: - dist.all_reduce(tensor_compressed) - if self.compressor.average: - tensor_compressed.div_(self.world_size) - return self.compressor.decompress(tensors, ctx) diff --git a/untitled folder/grace_dl/dist/communicator/broadcast.py b/untitled folder/grace_dl/dist/communicator/broadcast.py deleted file mode 100644 index a272752..0000000 --- a/untitled folder/grace_dl/dist/communicator/broadcast.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -from torch import distributed as dist - -from grace_dl.dist import Communicator - - -class Broadcast(Communicator): - - def __init__(self, compressor, memory, world_size, rank): - super().__init__(compressor, memory, world_size) - self.rank = rank - - def send_receive(self, tensors, name, ctx): - if not self.compressor.tensors_size_are_same: - raise NotImplemented() - - tensors_decompressed = [] - for root_rank in range(self.world_size): - if root_rank == self.rank: - broadcasted = tensors - for tensor_compressed in tensors: - dist.broadcast(tensor_compressed, root_rank) - else: - broadcasted = [] - for tensor_compressed in 
tensors: - tensor = torch.empty_like(tensor_compressed) - dist.broadcast(tensor, root_rank) - broadcasted.append(tensor) - tensor_decompressed = self.compressor.decompress(broadcasted, ctx) - tensors_decompressed.append(tensor_decompressed) - - tensor_aggregated = self.compressor.aggregate(tensors_decompressed) - return (tensor_aggregated / self.world_size) if self.compressor.average else tensor_aggregated diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp deleted file mode 100644 index 10b29ea..0000000 --- a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#include - -// CUDA forward declarations - -torch::Tensor cnat_compress_cuda(torch::Tensor input); -torch::Tensor cnat_compress_deterministic_cuda(torch::Tensor input); -torch::Tensor cnat_decompress_cuda(torch::Tensor input); - -// C++ interface - -torch::Tensor cnat_compress(torch::Tensor input) { - return cnat_compress_cuda(input); -} - -torch::Tensor cnat_compress_deterministic(torch::Tensor input) { - return cnat_compress_deterministic_cuda(input); -} - -torch::Tensor cnat_decompress(torch::Tensor input) { - return cnat_decompress_cuda(input); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("compress", &cnat_compress, "CNat compress (CUDA)"); - m.def("compress_deterministic", &cnat_compress_deterministic, "CNat compress deterministic (CUDA)"); - m.def("decompress", &cnat_decompress, "CNat decompress (CUDA)"); -} diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu b/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu deleted file mode 100644 index 88cc6c3..0000000 --- a/untitled folder/grace_dl/dist/compressor/cnat_cuda/cnat_cuda.cu +++ /dev/null @@ -1,184 +0,0 @@ -#include - -#include -#include -#include - -namespace { -__constant__ __device__ uint8_t sign_and_exp_to_encoding[512] = -{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, - 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, - 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, - 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, -109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, -123, 124, 125, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, -127, 127, 127, 127, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, -128, 128, 128, 128, 128, 128, 128, 128, 129, 130, 131, 132, 133, 134, -135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, -149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, -163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, -177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, -191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, -205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, -219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, -233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, -247, 248, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, -255, 255, 255, 255, 255, 255, 255, 255}; - -__constant__ __device__ uint32_t encoding_to_sign_and_exp[256] = -{ 0, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, - 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, - 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, -101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, -115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, -129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, -143, 144, 256, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, -285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, -299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, -313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, -327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, -341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, -355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, -369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, -383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, -397, 398, 399, 400}; - -__global__ void cnat_compress_cuda_kernel( - float* __restrict__ input, - uint8_t* __restrict__ output, - const float* __restrict__ rand, - int64_t len) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < len) { - if (input[index] == 0) - output[index] = 0; - 
else { - int exp; - float prob = abs(frexpf(input[index], &exp)) / 0.5 - 1.; // [0.5, 1) -> [0, 1) - if (rand[index] >= prob) exp -= 1; - - if (input[index] < 0) exp += 383; // 256+127 - else exp += 127; - output[index] = sign_and_exp_to_encoding[exp]; - - //exp += 127; - //uint8_t encode; - //if (exp<=17) encode = 0; - //else if (exp<=143) encode = uint8_t(exp-17); - //else encode = 127; - //if (input[index] < 0) encode += 256; - //output[index] = encode; - } - } -} - -__global__ void cnat_compress_deterministic_cuda_kernel( - float* __restrict__ input, - uint8_t* __restrict__ output, - int64_t len) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < len) { - if (input[index] == 0) - output[index] = 0; - else { - int exp; - float prob = abs(frexpf(input[index], &exp)) / 0.5 - 1.; // [0.5, 1) -> [0, 1) - if (0.5 >= prob) exp -= 1; - - if (input[index] < 0) exp += 383; // 256+127 - else exp += 127; - output[index] = sign_and_exp_to_encoding[exp]; - - //exp += 127; - //uint8_t encode; - //if (exp<=17) encode = 0; - //else if (exp<=143) encode = uint8_t(exp-17); - //else encode = 127; - //if (input[index] < 0) encode += 256; - //output[index] = encode; - } - } -} - -__global__ void cnat_decompress_cuda_kernel( - uint8_t* __restrict__ input, - float* __restrict__ output, - int64_t len) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < len) { - uint32_t sign_and_exp = encoding_to_sign_and_exp[input[index]] << 23; - output[index] = reinterpret_cast(sign_and_exp); - } -} - -} // namespace - -torch::Tensor cnat_compress_cuda(torch::Tensor input) { - auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kUInt8).device(input.device())); - auto rand = torch::rand_like(input, torch::TensorOptions().device(input.device())); // [0, 1) - const int threads = 1024; - int64_t numel = input.numel(); - auto blocks = numel/threads; - if (numel%threads || !blocks) blocks++; - - cnat_compress_cuda_kernel<<>>( - 
input.data_ptr(), - output.data_ptr(), - rand.data_ptr(), - numel); - - return output; -} - -torch::Tensor cnat_compress_deterministic_cuda(torch::Tensor input) { - auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kUInt8).device(input.device())); - const int threads = 1024; - int64_t numel = input.numel(); - auto blocks = numel/threads; - if (numel%threads || !blocks) blocks++; - - cnat_compress_deterministic_cuda_kernel<<>>( - input.data_ptr(), - output.data_ptr(), - numel); - - return output; -} - -torch::Tensor cnat_decompress_cuda(torch::Tensor input) { - auto output = torch::empty_like(input, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); - const int threads = 1024; - int64_t numel = input.numel(); - auto blocks = numel/threads; - if (numel%threads || !blocks) blocks++; - - cnat_decompress_cuda_kernel<<>>( - input.data_ptr(), - output.data_ptr(), - numel); - - return output; -} - diff --git a/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py deleted file mode 100644 index 9b564c5..0000000 --- a/untitled folder/grace_dl/dist/compressor/cnat_cuda/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='cnat_cuda', - ext_modules=[ - CUDAExtension('cnat_cuda', [ - 'cnat.cpp', - 'cnat_cuda.cu', - ]), - ], - cmdclass={ - 'build_ext': BuildExtension - }) diff --git a/untitled folder/grace_dl/dist/compressor/dgc.py b/untitled folder/grace_dl/dist/compressor/dgc.py deleted file mode 100644 index 31224d5..0000000 --- a/untitled folder/grace_dl/dist/compressor/dgc.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class DgcCompressor(Compressor): - - def __init__(self, compress_ratio): - super().__init__(tensors_size_are_same=False) - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - 
shape = tensor.size() - tensor = tensor.flatten() - numel = tensor.numel() - - sample_shape = [max(1, int(numel * 0.01))] - sample_index = torch.empty(sample_shape).uniform_(0, numel).type(torch.long) - sample_tensor = tensor[sample_index] - - k = max(1, int(numel * self.compress_ratio * 0.01)) - vals, indices = torch.topk(sample_tensor.abs(), k) - - thr = vals.min() - mask = tensor.abs() >= thr - selected = mask.sum() - - for _ in range(10): - if selected > 1.3 * numel * self.compress_ratio: - thr = 1.3 * thr - elif selected < 0.7 * numel * self.compress_ratio: - thr = 0.7 * thr - else: - break - mask = tensor.abs() >= thr - selected = mask.sum() - - indices, = torch.where(mask) - values = tensor[indices] - - tensor_compressed = values, indices - ctx = shape, mask, numel - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - values, indices = tensor_compressed - shape, _, numel = ctx - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/efsignsgd.py b/untitled folder/grace_dl/dist/compressor/efsignsgd.py deleted file mode 100644 index 443bf06..0000000 --- a/untitled folder/grace_dl/dist/compressor/efsignsgd.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class EFSignSGDCompressor(Compressor): - - def __init__(self, lr): - super().__init__(average=False) - self.learning_rate = lr - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - shape = tensor.size() - tensor = tensor.flatten() - - sign_encode = tensor >= 0 - mean = tensor.abs().mean() - tensor_compressed = mean, sign_encode.type(torch.uint8) - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - """Decoding the signs to float format """ - mean, sign_encode = tensor_compressed - 
sign_decode = sign_encode.type(torch.float32) * 2 - 1 - sign_decode = mean * sign_decode - tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - return sum(tensors) / self.learning_rate diff --git a/untitled folder/grace_dl/dist/compressor/fp16.py b/untitled folder/grace_dl/dist/compressor/fp16.py deleted file mode 100644 index 1bdd20c..0000000 --- a/untitled folder/grace_dl/dist/compressor/fp16.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class FP16Compressor(Compressor): - """Compress all floating point gradients to 16-bit.""" - - def compress(self, tensor, name): - """Downcasts the tensor to 16-bit.""" - dtype = tensor.dtype - if dtype.is_floating_point: - # Only allow compression from other floating point types - tensor = tensor.type(torch.float16) - return [tensor], dtype - - def decompress(self, tensors, dtype): - """Upcasts the tensor to the initialization dtype.""" - tensor_decompressed, = tensors - if dtype.is_floating_point: - tensor_decompressed = tensor_decompressed.type(dtype) - return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/natural.py b/untitled folder/grace_dl/dist/compressor/natural.py deleted file mode 100644 index 0dd29ec..0000000 --- a/untitled folder/grace_dl/dist/compressor/natural.py +++ /dev/null @@ -1,57 +0,0 @@ -# conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack - -from grace_dl.torch import Compressor - - -class NaturalCompressor(Compressor): - def __init__(self): - super().__init__() - - def compress(self, tensor, name): - import cupy - shape = tensor.size() - tensor_flatten = tensor.flatten() - cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten)) - tensor_cast = cupy_tensor.view(cupy.int32) - sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000) - exp = 
tensor_cast & cupy.int32(0b01111111100000000000000000000000) - mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111) - exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111, - size=cupy_tensor.shape, - dtype=cupy.int32) - exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) - exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000) - exps = cupy.right_shift(exp_shift, 23) - exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18) - tensor_compressed = exps.astype(cupy.uint8) - return [from_dlpack(tensor_compressed.toDlpack())], shape - - def decompress(self, tensor_compressed, shape): - import cupy - tensor_compressed, = tensor_compressed - cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_compressed)) - sign = cupy_tensor > 127 - exps = cupy.bitwise_and(cupy_tensor, 0b01111111) - floats = cupy.left_shift((exps + 18).astype(cupy.int32), 23).view(cupy.float32) - tensor_decompressed = cupy.where(sign, -floats, floats) - tensor_decompressed = cupy.multiply((exps >= 1).astype(cupy.float32), tensor_decompressed) - return from_dlpack(tensor_decompressed.toDlpack()).view(shape) - - -class NaturalCompressor_CUDA(Compressor): - def __init__(self): - super().__init__() - - def compress(self, tensor, name): - import cnat_cuda - shape = tensor.size() - tensor_compressed = cnat_cuda.compress(tensor.flatten()) - return [tensor_compressed, ], shape - - def decompress(self, tensor_compressed, shape): - import cnat_cuda - tensor_compressed, = tensor_compressed - tensor_decompressed = cnat_cuda.decompress(tensor_compressed) - return tensor_decompressed.view(shape) \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/none.py b/untitled folder/grace_dl/dist/compressor/none.py deleted file mode 100644 index 02d7bb0..0000000 --- a/untitled folder/grace_dl/dist/compressor/none.py +++ /dev/null @@ -1,12 +0,0 @@ -from 
grace_dl.dist import Compressor - - -class NoneCompressor(Compressor): - """Default no-op compression.""" - - def compress(self, tensor, name): - return [tensor], None - - def decompress(self, tensors, ctx): - tensor, = tensors - return tensor diff --git a/untitled folder/grace_dl/dist/compressor/onebit.py b/untitled folder/grace_dl/dist/compressor/onebit.py deleted file mode 100644 index 5558b18..0000000 --- a/untitled folder/grace_dl/dist/compressor/onebit.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class OneBitCompressor(Compressor): - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - numel = tensor.numel() - - mask0 = tensor < 0 - sum0 = torch.sum(tensor[mask0]) - num0 = torch.sum(mask0).float() - mean0 = sum0 / num0 if num0 > 0 else sum0 - - mask1 = ~mask0 - sum1 = torch.sum(tensor[mask1]) - num1 = numel - num0 - mean1 = sum1 / num1 if num1 > 0 else sum1 - - tensor_compressed = mask0.type(torch.uint8), mean0, mean1 - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - mask0, mean0, mean1 = tensor_compressed - tensor_decompressed = mask0 * mean0 + ~mask0 * mean1 - tensor_decompressed = tensor_decompressed.view(shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/powersgd.py b/untitled folder/grace_dl/dist/compressor/powersgd.py deleted file mode 100644 index 3912195..0000000 --- a/untitled folder/grace_dl/dist/compressor/powersgd.py +++ /dev/null @@ -1,65 +0,0 @@ -import torch -from torch import distributed as dist - -from grace_dl.dist import Compressor - - -@torch.jit.script -def orthogonalize(matrix): - n, m = matrix.shape - for i in range(m): - # Normalize the i'th column - col = matrix[:, i: i + 1] - col /= torch.sqrt(torch.sum(col ** 2)) - # Project it on the rest and remove it - if i + 1 < m: - rest = matrix[:, i + 1:] - # rest -= torch.matmul(col.t(), rest) * col - rest -= torch.sum(col * rest, 
dim=0) * col - - -class PowerSGDCompressor(Compressor): - - def __init__(self, rank=1, use_memory=False, world_size=1): - super().__init__() - self.world_size = world_size - self.q_memory = {} - self.rank = rank - self.use_memory = use_memory - - def compress(self, tensor, name): - if tensor.dim() == 1: - return [tensor], None - - shape = tensor.size() - matrix = tensor.view([shape[0], -1]) - n, m = matrix.size() - r = min(n, m, self.rank) - if self.use_memory and name in self.q_memory: - q = self.q_memory[name] - else: - q = torch.empty(m, r, dtype=matrix.dtype, layout=matrix.layout, device=matrix.device).normal_() - # q, _ = torch.qr(q) - orthogonalize(q) - - p = torch.mm(matrix, q) - dist.all_reduce(p) - p = p / self.world_size - # p, _ = torch.qr(p) - orthogonalize(p) - q = torch.mm(matrix.t(), p) - dist.all_reduce(q) - q = q / self.world_size - ctx = p, q, shape - if self.use_memory: - self.q_memory[name] = q - return [], ctx - - def decompress(self, tensors, ctx): - if ctx is None: - tensor, = tensors - return tensor - p, q, tensor_shape = ctx - new_tensor = torch.mm(p, q.t()) - tensor_decompressed = new_tensor.view(tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/dist/compressor/qsgd.py b/untitled folder/grace_dl/dist/compressor/qsgd.py deleted file mode 100644 index 7922bf8..0000000 --- a/untitled folder/grace_dl/dist/compressor/qsgd.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -from grace_dl.dist import Compressor - - -class QSGDCompressor(Compressor): - - def __init__(self, quantum_num, bucket_size=128): - super().__init__() - self.quantum_num = quantum_num - self.bucket_size = bucket_size - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - abs_gradient = tensor.abs() - - if tensor.numel() % self.bucket_size != 0: - pad_size = self.bucket_size - tensor.numel() % self.bucket_size - pad_tensor = torch.cat([tensor, torch.zeros(pad_size, dtype=tensor.dtype, device=tensor.device)]) - 
else: - pad_tensor = tensor - pad_tensor = pad_tensor.view([-1, self.bucket_size]) - pad_tensor_sqsum = torch.sum(pad_tensor ** 2, dim=1) - bucket_norm = torch.sqrt(pad_tensor_sqsum) - b = torch.ones([1, self.bucket_size], device=tensor.device) - expand_norm = torch.matmul(bucket_norm.view([-1, 1]), b) - norm = expand_norm.flatten()[:tensor.numel()] - - level_float = self.quantum_num / norm * abs_gradient - previous_level = level_float.floor() - prob = torch.empty_like(tensor).uniform_() - is_next_level = (prob < (level_float - previous_level)).type(torch.float32) - new_level = (previous_level + is_next_level) - - sign = tensor.sign() - tensor_compressed = (new_level * sign).type(torch.int16) - tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) - - return (tensor_compressed, bucket_norm), shape - - def decompress(self, tensor_compressed, ctx): - tensor_compressed, bucket_norm = tensor_compressed - shape = ctx - b = torch.ones([1, self.bucket_size], device=tensor_compressed.device) - expand_norm = torch.matmul(bucket_norm.view([-1, 1]), b) - norm = expand_norm.flatten()[:shape.numel()] - decode_output = tensor_compressed.type(torch.float32) - tensor_decompressed = norm / self.quantum_num * decode_output - tensor_decompressed = tensor_decompressed.view(shape) - - return tensor_decompressed - - -class QSGDCompressor_CUDA(Compressor): - - def __init__(self, quantum_num, bucket_size=128): - super().__init__() - self.quantum_num = quantum_num - self.bucket_size = bucket_size - - def compress(self, tensor, name): - import qsgd_cuda - shape = tensor.size() - tensor = tensor.flatten() - - tensor_compressed, bucket_norm = qsgd_cuda.compress(tensor, self.quantum_num, self.bucket_size) - tensor_compressed = tensor_compressed, bucket_norm.float() - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - import qsgd_cuda - tensor_compressed, bucket_norm = tensor_compressed - - tensor_decompressed = 
qsgd_cuda.decompress(tensor_compressed, bucket_norm.double(), self.quantum_num, self.bucket_size) - - return tensor_decompressed.view(shape) - - diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py deleted file mode 100644 index 6814041..0000000 --- a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/example.py +++ /dev/null @@ -1,16 +0,0 @@ -# Compilation cmd: python setup.py install -# Alternative JIT compilation, refer to https://pytorch.org/tutorials/advanced/cpp_extension.html#jit-compiling-extensions -# from torch.utils.cpp_extension import load -# qsgd_cuda = load(name="qsgd_cuda", sources=["qsgd.cpp", "qsgd_cuda.cu"], verbose=True) - -import torch -import qsgd_cuda - -t = torch.rand(200000, dtype=torch.float32, device='cuda') -print(t) - -out, scaler = qsgd_cuda.compress(t, 127, 128) -print(out, scaler) - -out2 = qsgd_cuda.decompress(out, scaler, 127, 128) -print(out2) \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp deleted file mode 100644 index 6f11f5e..0000000 --- a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include - -// CUDA forward declarations - -std::vector qsgd_compress_cuda(torch::Tensor input, int level, int bucket_size); - -torch::Tensor qsgd_decompress_cuda(torch::Tensor input,torch::Tensor scaler, int level, int bucket_size); - - -// C++ interface -std::vector qsgd_compress(torch::Tensor input, int level, int bucket_size) { - return qsgd_compress_cuda(input, level, bucket_size); -} - -torch::Tensor qsgd_decompress(torch::Tensor input,torch::Tensor scaler, int level, int bucket_size) { - return qsgd_decompress_cuda(input, scaler, level, bucket_size); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("compress", &qsgd_compress, "QSGD compression"); - m.def("decompress", 
&qsgd_decompress, "QSGD decompression"); -} diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu deleted file mode 100644 index 16cac6c..0000000 --- a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/qsgd_cuda.cu +++ /dev/null @@ -1,538 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - - -using namespace std; - -#define EIGEN_USE_GPU -#define maxThreadsPerBlock 1024 - -__global__ void _qsgdreduceSumV2(float *g_odata, float *g_idata, unsigned int n) -{ - extern __shared__ float sdata[]; - - // set thread ID - unsigned int tid = threadIdx.x; - unsigned int gridSize = blockDim.x * gridDim.x; - unsigned int i = blockIdx.x * blockDim.x + tid; - unsigned int blockSize = blockDim.x; - - sdata[tid] = 0; - - while (i < n) { - sdata[tid] += g_idata[i];// + g_idata[i + blockDim.x]; - i += gridSize; - } - __syncthreads(); - - // in-place reduction and complete unroll - if (blockSize >= 1024) { - if (tid < 512) sdata[tid] += sdata[tid + 512]; - __syncthreads(); - } - if (blockSize >= 512) { - if (tid < 256) sdata[tid] += sdata[tid + 256]; - __syncthreads(); - } - if (blockSize >= 256) { - if (tid < 128) sdata[tid] += sdata[tid + 128]; - __syncthreads(); - } - if (blockSize >= 128) { - if (tid < 64) sdata[tid] += sdata[tid + 64]; - __syncthreads(); - } - - // unrolling warp - if (tid < 32) - { - volatile float *vsmem = sdata; - vsmem[tid] += vsmem[tid + 32]; - vsmem[tid] += vsmem[tid + 16]; - vsmem[tid] += vsmem[tid + 8]; - vsmem[tid] += vsmem[tid + 4]; - vsmem[tid] += vsmem[tid + 2]; - vsmem[tid] += vsmem[tid + 1]; - } - - // write result for this block to global mem - if (tid == 0) { - g_odata[blockIdx.x] = sdata[0]; - } -} - -__global__ void _qsgdreduceClipThresholdV2(float *g_odata, float *g_idata, unsigned int n) -{ - extern __shared__ float sdata[]; - - // set thread ID - unsigned int tid = threadIdx.x; - unsigned int gridSize = blockDim.x * 
gridDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int blockSize = blockDim.x; - - sdata[tid] = 0; - - while (i < n) { - if (isfinite(g_idata[i])) { - sdata[tid] += g_idata[i] * g_idata[i];// + g_idata[i + blockDim.x] * g_idata[i + blockDim.x]; - } - i += gridSize; - } - __syncthreads(); - - // in-place reduction and complete unroll - if (blockSize >= 1024) { - if (tid < 512) sdata[tid] += sdata[tid + 512]; - __syncthreads(); - } - if (blockSize >= 512) { - if (tid < 256) sdata[tid] += sdata[tid + 256]; - __syncthreads(); - } - if (blockSize >= 256) { - if (tid < 128) sdata[tid] += sdata[tid + 128]; - __syncthreads(); - } - if (blockSize >= 128) { - if (tid < 64) sdata[tid] += sdata[tid + 64]; - __syncthreads(); - } - - // unrolling warp - if (tid < 32) - { - volatile float *vsmem = sdata; - vsmem[tid] += vsmem[tid + 32]; - vsmem[tid] += vsmem[tid + 16]; - vsmem[tid] += vsmem[tid + 8]; - vsmem[tid] += vsmem[tid + 4]; - vsmem[tid] += vsmem[tid + 2]; - vsmem[tid] += vsmem[tid + 1]; - } - - // write result for this block to global mem - if (tid == 0) { - g_odata[blockIdx.x] = sdata[0]; - } -} - -__global__ void _qsgdreduceAbsMaxV2(float *g_odata, float *g_idata, unsigned int n) -{ - extern __shared__ float sdata[]; - - // set thread ID - unsigned int tid = threadIdx.x; - unsigned int gridSize = blockDim.x * gridDim.x; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int blockSize = blockDim.x; - - sdata[tid] = 0; - - while (i < n) { - if (isfinite(g_idata[i]) && isfinite(sdata[tid])) - sdata[tid] = fmaxf(sdata[tid], fabsf(g_idata[i])); //fmaxf(fabsf(g_idata[i]), fabsf(g_idata[i + blockDim.x]))); - else - sdata[tid] = nanf("123"); - i += gridSize; - } - __syncthreads(); - - // in-place reduction and complete unroll - if (blockSize >= 1024) { - if (tid < 512) { - if (isfinite(sdata[tid]) && isfinite(sdata[tid + 512])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 512]); - else sdata[tid] = nanf("123"); - } - __syncthreads(); 
- } - if (blockSize >= 512) { - if (tid < 256) { - if (isfinite(sdata[tid]) && isfinite(sdata[tid + 256])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 256]); - else sdata[tid] = nanf("123"); - } - __syncthreads(); - } - if (blockSize >= 256) { - if (tid < 128) { - if (isfinite(sdata[tid]) && isfinite(sdata[tid + 128])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 128]); - else sdata[tid] = nanf("123"); - } - __syncthreads(); - } - if (blockSize >= 128) { - if (tid < 64) { - if (isfinite(sdata[tid]) && isfinite(sdata[tid + 64])) sdata[tid] = fmaxf(sdata[tid], sdata[tid + 64]); - else sdata[tid] = nanf("123"); - } - __syncthreads(); - } - - // unrolling warp - if (tid < 32) - { - volatile float *vsmem = sdata; - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 32])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 32]); - else vsmem[tid] = nanf("123"); - - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 16])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 16]); - else vsmem[tid] = nanf("123"); - - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 8])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 8]); - else vsmem[tid] = nanf("123"); - - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 4])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 4]); - else vsmem[tid] = nanf("123"); - - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 2])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 2]); - else vsmem[tid] = nanf("123"); - - if (isfinite(vsmem[tid]) && isfinite(vsmem[tid + 1])) - vsmem[tid] = fmaxf(vsmem[tid], vsmem[tid + 1]); - else vsmem[tid] = nanf("123"); - - } - - // write result for this block to global mem - if (tid == 0) { - g_odata[blockIdx.x] = sdata[0]; - } -} - -__global__ void _qsgdcomputeSqrt(float *scaler) -{ - *scaler = sqrt(*scaler); - //printf("l2 norm result: %f\n", *scaler); - //__syncthreads(); -} - -__global__ void _qsgdinitCURand(unsigned int len, unsigned int seed, curandState* states) -{ - unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; - /* we have 
to initialize the state */ - if (index < len) - curand_init(seed + index, /* the seed can be the same for each core, here we pass the time in from the CPU */ - 0, /* the sequence number should be different for each core (unless you want all - cores to get the same sequence of numbers for some reason - use thread id! */ - 0, /* the offset is how much extra we advance in the sequence for each call, can be 0 */ - &states[index]); -} - -__global__ void _qsgdcompensateMemory(float *dst, const float *src, const float *local_mem, int len) -{ - unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int stride = gridDim.x * blockDim.x; - - for (int i = index; i < len; i += stride){ - if (isfinite(src[i])) { - //dst[i] = src[i]; // + local_mem[i]; //remove memory compensation for comparison purposes. - dst[i] = src[i] + local_mem[i]; - } - else { - dst[i] = nanf("123"); - } - //printf("CompensateMemory result: idx=%d, src=%f, mem=%f, dst=%f\n", i, src[i], local_mem[i], dst[i]); - //__syncthreads(); - } -} - -__global__ void _qsgdTernarizeValue(int8_t *dst, const float *src, float *scaler, float *local_mem, const int len, int level, curandState* states) -{ - unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int stride = gridDim.x * blockDim.x; - curandState local_state = states[index]; - float norm_scaler = *scaler; - - // The input tensor here has been clipped. 
- // Hence we have the ternarize formula: dst[i] = new_level[i] * sign(src[i]) - for (int i = index; i < len; i += stride) { - if (isfinite(norm_scaler) && isfinite(src[i])) { - float rand_sample = curand_uniform(&local_state); - float level_float = (float)level / norm_scaler * fabsf(src[i]); - int8_t previous_level = floor(level_float); - if (rand_sample < level_float - previous_level) { - dst[i] = previous_level + 1; // 1 is required by qsgd - } - else { - dst[i] = previous_level; - } - if (src[i] < 0){ - dst[i] = -dst[i]; - } - - // update local memory - local_mem[i] = src[i] - norm_scaler / (float)level * (float)dst[i]; // remove vanilla local memory update for comparison purposes. - } - else { - // encode value to the minimum for Inf or NaN - dst[i] = -128; - } - //printf("compressed result: idx=%d, scaler=%f, src=%f, dst=%d, update_mem=%f\n", i, *scaler, src[i], dst[i], local_mem[i]); - //__syncthreads(); - } -} - - -// For qsgd allreduce -// __global__ void _qsgdDeternarizeValue(int len, float *dst, int8_t *src, float *scaler, int level) -// { -// int index = blockIdx.x * blockDim.x + threadIdx.x; -// int stride = blockDim.x * gridDim.x; -// float norm_scaler = *scaler; - -// for (int i = index; i < len; i += stride) -// { -// dst[i] = norm_scaler / (float)level * (float)src[i]; -// } -// } - - -// For qsgd allgather -__global__ void _qsgdDeternarizeAndAdd(int len, float *dst, int8_t *src, float *scaler, int level) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - float norm_scaler = *scaler; - - for (int i = index; i < len; i += stride) { - if (src[i] == -128) { - dst[i] = nanf("123"); - } - else { - dst[i] += norm_scaler / (float)level * (float)src[i]; - } - //printf("decompressed result: idx=%d, scaler=%f, src=%d, dst=%f\n", i, *scaler, src[i], dst[i]); - //__syncthreads(); - } -} - -__global__ void _bucket_l2norm(const int len, double *dst, float *src, const int bucket_size) -{ - int index = blockIdx.x * 
blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - const int loop_times = len / bucket_size; - const int remain_nums = len % bucket_size; - - for (int i = index; i < loop_times; i += stride) - { -#pragma unroll - for (int j = 0; j < bucket_size; j ++){ - if (isfinite(src[bucket_size*i+j])) { - dst[i] += (double)(src[bucket_size*i+j]) * (double)(src[bucket_size*i+j]); - } - } - dst[i] = sqrt(dst[i]); - } - if (remain_nums && index == loop_times){ -#pragma unroll - for (int i = 0; i < remain_nums; i++){ - if (isfinite(src[bucket_size*loop_times+i])) { - dst[loop_times] += (double)(src[bucket_size*loop_times+i]) * (double)(src[bucket_size*loop_times+i]); - } - } - dst[loop_times] = sqrt(dst[loop_times]); - } - -} - - - -__global__ void _bucket_qsgdTernarizeValue(int8_t *dst, const float *src, double *scaler, const int len, int level, const int bucket_size, unsigned int seed) -{ - unsigned int index = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int stride = gridDim.x * blockDim.x; - // curandState local_state = states[index]; - curandState local_state; - - - // The input tensor here has been clipped. - // Hence we have the ternarize formula: dst[i] = new_level[i] * sign(src[i]) - for (int i = index; i < len; i += stride) { - float norm_scaler = (float)(scaler[i/bucket_size]); - curand_init(seed + index, 0, 0, &local_state); - if (isfinite(norm_scaler) && isfinite(src[i])) { - float rand_sample = curand_uniform(&local_state); - float level_float = (float)level / norm_scaler * fabsf(src[i]); - int8_t previous_level = floor(level_float); - if (rand_sample < level_float - previous_level) { - dst[i] = previous_level + 1; // 1 is required by qsgd - } - else { - dst[i] = previous_level; - } - if (src[i] < 0){ - dst[i] = -dst[i]; - } - - // update local memory - //local_mem[i] = src[i] - norm_scaler / (float)level * (float)dst[i]; // remove vanilla local memory update for comparison purposes. 
- } - else { - // encode value to the minimum for Inf or NaN - dst[i] = -128; - } - //printf("compressed result: idx=%d, scaler=%f, src=%f, dst=%d, update_mem=%f\n", i, *scaler, src[i], dst[i], local_mem[i]); - //__syncthreads(); - } -} - -// For qsgd allgather -__global__ void _bucket_qsgdDeternarizeAndAdd(int len, float *dst, int8_t *src, double *scaler, int level, const int bucket_size) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (int i = index; i < len; i += stride) { - float norm_scaler = (float)(scaler[i/bucket_size]); - if (src[i] == -128) { - dst[i] = nanf("123"); - } - else { - dst[i] = norm_scaler / (float)level * (float)src[i]; - //atomicAdd(dst+i, norm_scaler / (float)level * (float)src[i]); - } - //printf("decompressed result: idx=%d, scaler=%f, src=%d, dst=%f\n", i, *scaler, src[i], dst[i]); - //__syncthreads(); - } -} - - -/*----------------------------------- Reduce Wrapper --------------------------------------------*/ -void qsgdGPUReduce(int len, float *d_out, float *d_intermediate_res, float *result, int whichKernel, cudaStream_t stream) { - // d_intermediate_res holds the input - // setting up blocks - int numBlocks = (int) ceil(1.0 * len / maxThreadsPerBlock); //(len / maxThreadsPerBlock) + 1; - - int prevNumBlocks = len; - // recursively reduce to get the result - while (numBlocks > maxThreadsPerBlock) { - // clear d_out - cudaMemset(d_out, 0, numBlocks * sizeof(float)); - - switch (whichKernel) { - // reduce sum - case 0: - _qsgdreduceSumV2<<>>(d_out, d_intermediate_res, len); - break; - // reduce absmax - case 1: - _qsgdreduceAbsMaxV2<<>>(d_out, d_intermediate_res, len); - break; - // reduce clip threshold - case 2: - _qsgdreduceClipThresholdV2<<>>(d_out, d_intermediate_res, len); - // we don't need to square the intermediate results. 
- whichKernel = 0; - break; - default: - break; - } - - // by now, d_out holds the intermediate result, copy it to intermedaite_res for the next run - cudaMemcpy(d_intermediate_res, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToDevice); - // compute reduced problem size - prevNumBlocks = numBlocks; - len = numBlocks; - numBlocks = (int) ceil(1.0 * numBlocks / maxThreadsPerBlock); //numBlocks / maxThreadsPerBlock + 1; - } - - // use one block to compute the rest. - // clear d_out - cudaMemset(d_out, 0, prevNumBlocks* sizeof(float)); - switch (whichKernel) { - // reduce sum - case 0: - _qsgdreduceSumV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); - break; - // reduce absmax - case 1: - _qsgdreduceAbsMaxV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); - break; - // reduce clip threshold - case 2: - _qsgdreduceClipThresholdV2<<<1, maxThreadsPerBlock, maxThreadsPerBlock * sizeof(float)>>>(d_out, d_intermediate_res, prevNumBlocks); - break; - default: - break; - } - // as we just use one block, just move the first element of d_out to result - cudaMemcpy(result, d_out, sizeof(float), cudaMemcpyDeviceToDevice); -} - -/*----------------------------------- Kernel Launch Wrappers ------------------------------------*/ - - -void GPUReduceL2Norm(float *array, int len, double *l2norm_scaler, const int bucket_size) -{ - int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); - _bucket_l2norm<<>>(len, l2norm_scaler, array, bucket_size); -} - -// void qsgdGPUInit_curand(int n, unsigned int seed, curandState* cuda_states) -// { -// int blocksPerGrid = (int) ceil(1.0 * n / maxThreadsPerBlock); -// _qsgdinitCURand<<>>(n, seed, cuda_states); -// } - -// void qsgdGPUCompensateMemory(float *dst, const float *src, const float* local_mem, int len) -// { -// int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); -// _qsgdcompensateMemory<<>>(dst, 
src, local_mem, len); -// } - - - -void GPUTernarizeMultiLevelValue(int8_t *dst, const float *src, double *scaler, int len, int level, const int bucket_size) -{ - int blocksPerGrid = (int) ceil(1.0 * std::min(len, 1024 * 1024 * 25) / maxThreadsPerBlock); - unsigned int seed = time(NULL); - _bucket_qsgdTernarizeValue<<>>(dst, src, scaler, len, level, bucket_size, seed); -} - -void GPUDeternarizeMultiLevelValue(int len, float *dst, int8_t *src, double *scaler, int level, const int bucket_size) -{ - int blocksPerGrid = (int) ceil(1.0 * len / maxThreadsPerBlock); - _bucket_qsgdDeternarizeAndAdd<<>>(len, dst, src, scaler, level, bucket_size); -} - - - - -std::vector qsgd_compress_cuda(torch::Tensor input, int level, int bucket_size) { - - int element_nums = input.numel(); - int num_buckets = ceil((float)element_nums / bucket_size); - auto d_l2norm_scaler = torch::zeros(num_buckets, torch::TensorOptions().dtype(torch::kFloat64).device(input.device())); - auto buffer_data = torch::empty(element_nums, torch::TensorOptions().dtype(torch::kInt8).device(input.device())); - -// curandState* cuda_states; -// cuda_states = (curandState*)torch::empty(element_nums, torch::TensorOptions().dtype(torch::kInt).device(input.device())).data_ptr(); -// qsgdGPUInit_curand(element_nums, time(NULL), cuda_states); - - GPUReduceL2Norm((float*)input.data_ptr(), element_nums, (double*)d_l2norm_scaler.data_ptr(), bucket_size); - GPUTernarizeMultiLevelValue((int8_t*)buffer_data.data_ptr(), (float*)input.data_ptr(), (double*)d_l2norm_scaler.data_ptr(), - element_nums, level, bucket_size); - - return {buffer_data, d_l2norm_scaler}; -} - - -torch::Tensor qsgd_decompress_cuda(torch::Tensor input, torch::Tensor d_l2norm_scaler, int level, int bucket_size) { - int element_nums = input.numel(); - int num_buckets = ceil((float)element_nums / bucket_size); - auto buffer_data = torch::empty(element_nums, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); - 
GPUDeternarizeMultiLevelValue(element_nums, (float*)buffer_data.data_ptr(), (int8_t*)input.data_ptr(), - (double*)d_l2norm_scaler.data_ptr(), level, bucket_size); - return buffer_data; -} \ No newline at end of file diff --git a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py deleted file mode 100644 index 9542698..0000000 --- a/untitled folder/grace_dl/dist/compressor/qsgd_cuda/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='qsgd_cuda', - ext_modules=[ - CUDAExtension('qsgd_cuda', [ - 'qsgd.cpp', - 'qsgd_cuda.cu', - ]), - ], - cmdclass={ - 'build_ext': BuildExtension - }) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh deleted file mode 100644 index 3b6cc4c..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_histogram.cuh +++ /dev/null @@ -1,787 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
- */ - -#pragma once - -#include - -#include "../util_type.cuh" -#include "../block/block_load.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * - */ -enum BlockHistogramMemoryPreference -{ - GMEM, - SMEM, - BLEND -}; - - -/** - * Parameterizable tuning policy type for AgentHistogram - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue -struct AgentHistogramPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier 
for reading input elements -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . - */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. - int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading samples - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT, ///< Signed integer type for global offsets - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability -struct AgentHistogram -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The sample type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - /// The pixel type of SampleT - typedef typename CubVector::Type PixelT; - - /// The quad type of SampleT - typedef typename CubVector::Type QuadT; - - /// Constants - enum - { - BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, - - PIXELS_PER_THREAD = 
AgentHistogramPolicyT::PIXELS_PER_THREAD, - SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, - - TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, - TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, - - IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, - - MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? - AgentHistogramPolicyT::MEM_PREFERENCE : - GMEM, - - IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, - }; - - /// Cache load modifier for reading input elements - static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; - - - /// Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - SampleIteratorT>::Type // Directly use the supplied input iterator type - WrappedSampleIteratorT; - - /// Pixel input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedPixelIteratorT; - - /// Qaud input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedQuadIteratorT; - - /// Parameterized BlockLoad type for samples - typedef BlockLoad< - SampleT, - BLOCK_THREADS, - SAMPLES_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadSampleT; - - /// Parameterized BlockLoad type for pixels - typedef BlockLoad< - PixelT, - BLOCK_THREADS, - PIXELS_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadPixelT; - - /// Parameterized BlockLoad type for quads - typedef BlockLoad< - QuadT, - BLOCK_THREADS, - QUADS_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadQuadT; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) - - int tile_idx; - - // Aliasable storage layout - union Aliasable - { - 
typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples - typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads - - } aliasable; - }; - - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Sample input iterator (with cache modifier applied, if possible) - WrappedSampleIteratorT d_wrapped_samples; - - /// Native pointer for input samples (possibly NULL if unavailable) - SampleT* d_native_samples; - - /// The number of output bins for each channel - int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; - - /// The number of privatized bins for each channel - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; - - /// Reference to gmem privatized histograms for each channel - CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; - - /// Reference to final output histograms (gmem) - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; - - /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; - - /// The transform operator for determining privatized counter indices from samples, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; - - /// Whether to prefer privatized smem counters vs privatized global counters - bool prefer_smem; - - - //--------------------------------------------------------------------- - // Initialize privatized bin counters - //--------------------------------------------------------------------- - - // Initialize privatized bin counters - 
__device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) - { - // Initialize histogram bin counts to zeros - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) - { - privatized_histograms[CHANNEL][privatized_bin] = 0; - } - } - - // Barrier to make sure all threads are done updating counters - CTA_SYNC(); - } - - - // Initialize privatized bin counters. Specialized for privatized shared-memory counters - __device__ __forceinline__ void InitSmemBinCounters() - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - InitBinCounters(privatized_histograms); - } - - - // Initialize privatized bin counters. Specialized for privatized global-memory counters - __device__ __forceinline__ void InitGmemBinCounters() - { - InitBinCounters(d_privatized_histograms); - } - - - //--------------------------------------------------------------------- - // Update final output histograms - //--------------------------------------------------------------------- - - // Update final output histograms from privatized histograms - __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) - { - // Barrier to make sure all threads are done updating counters - CTA_SYNC(); - - // Apply privatized bin counts to output bin counts - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_bins = num_privatized_bins[CHANNEL]; - for (int privatized_bin = threadIdx.x; - privatized_bin < channel_bins; - privatized_bin += BLOCK_THREADS) - { - int output_bin = -1; - CounterT count = privatized_histograms[CHANNEL][privatized_bin]; - bool is_valid = count > 0; - - 
output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); - - if (output_bin >= 0) - { - atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); - } - - } - } - } - - - // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters - __device__ __forceinline__ void StoreSmemOutput() - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - StoreOutput(privatized_histograms); - } - - - // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters - __device__ __forceinline__ void StoreGmemOutput() - { - StoreOutput(d_privatized_histograms); - } - - - //--------------------------------------------------------------------- - // Tile accumulation - //--------------------------------------------------------------------- - - // Accumulate pixels. Specialized for RLE compression. 
- __device__ __forceinline__ void AccumulatePixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD], - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], - Int2Type is_rle_compress) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - // Bin pixels - int bins[PIXELS_PER_THREAD]; - - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - { - bins[PIXEL] = -1; - privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); - } - - CounterT accumulator = 1; - - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) - { - if (bins[PIXEL] != bins[PIXEL + 1]) - { - if (bins[PIXEL] >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); - - accumulator = 0; - } - accumulator++; - } - - // Last pixel - if (bins[PIXELS_PER_THREAD - 1] >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); - } - } - - - // Accumulate pixels. Specialized for individual accumulation of each pixel. 
- __device__ __forceinline__ void AccumulatePixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD], - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], - Int2Type is_rle_compress) - { - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - int bin = -1; - privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); - if (bin >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bin, 1); - } - } - } - - - /** - * Accumulate pixel, specialized for smem privatized histogram - */ - __device__ __forceinline__ void AccumulateSmemPixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD]) - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); - } - - - /** - * Accumulate pixel, specialized for gmem privatized histogram - */ - __device__ __forceinline__ void AccumulateGmemPixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD]) - { - AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); - } - - - - //--------------------------------------------------------------------- - // Tile loading - //--------------------------------------------------------------------- - - // Load full, aligned tile using pixel iterator (multi-channel) - template - __device__ __forceinline__ void LoadFullAlignedTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) - { - typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; - - WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + 
block_offset)); - - // Load using a wrapped pixel iterator - BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( - d_wrapped_pixels, - reinterpret_cast(samples)); - } - - // Load full, aligned tile using quad iterator (single-channel) - __device__ __forceinline__ void LoadFullAlignedTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type<1> num_active_channels) - { - typedef QuadT AliasedQuads[QUADS_PER_THREAD]; - - WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); - - // Load using a wrapped quad iterator - BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( - d_wrapped_quads, - reinterpret_cast(samples)); - } - - // Load full, aligned tile - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); - } - - // Load full, mis-aligned tile using sample iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; - - // Load using sample iterator - BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( - d_wrapped_samples + block_offset, - reinterpret_cast(samples)); - } - - // Load partially-full, aligned tile using the pixel iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; - - WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); - - int valid_pixels = valid_samples / NUM_CHANNELS; - - // Load using a wrapped pixel iterator - 
BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( - d_wrapped_pixels, - reinterpret_cast(samples), - valid_pixels); - } - - // Load partially-full, mis-aligned tile using sample iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; - - BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( - d_wrapped_samples + block_offset, - reinterpret_cast(samples), - valid_samples); - } - - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - // Consume a tile of data samples - template < - bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) - bool IS_FULL_TILE> // Whether the tile is full - __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) - { - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; - bool is_valid[PIXELS_PER_THREAD]; - - // Load tile - LoadTile( - block_offset, - valid_samples, - samples, - Int2Type(), - Int2Type()); - - // Set valid flags - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); - - // Accumulate samples -#if CUB_PTX_ARCH >= 120 - if (prefer_smem) - AccumulateSmemPixels(samples, is_valid); - else - AccumulateGmemPixels(samples, is_valid); -#else - AccumulateGmemPixels(samples, is_valid); -#endif - - } - - - // Consume row tiles. 
Specialized for work-stealing from queue - template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) - { - - int num_tiles = num_rows * tiles_per_row; - int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; - OffsetT num_even_share_tiles = gridDim.x * gridDim.y; - - while (tile_idx < num_tiles) - { - int row = tile_idx / tiles_per_row; - int col = tile_idx - (row * tiles_per_row); - OffsetT row_offset = row * row_stride_samples; - OffsetT col_offset = (col * TILE_SAMPLES); - OffsetT tile_offset = row_offset + col_offset; - - if (col == tiles_per_row - 1) - { - // Consume a partially-full tile at the end of the row - OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; - ConsumeTile(tile_offset, num_remaining); - } - else - { - // Consume full tile - ConsumeTile(tile_offset, TILE_SAMPLES); - } - - CTA_SYNC(); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; - - CTA_SYNC(); - - tile_idx = temp_storage.tile_idx; - } - } - - - // Consume row tiles. 
Specialized for even-share (striped across thread blocks) - template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) - { - for (int row = blockIdx.y; row < num_rows; row += gridDim.y) - { - OffsetT row_begin = row * row_stride_samples; - OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); - OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); - - while (tile_offset < row_end) - { - OffsetT num_remaining = row_end - tile_offset; - - if (num_remaining < TILE_SAMPLES) - { - // Consume partial tile - ConsumeTile(tile_offset, num_remaining); - break; - } - - // Consume full tile - ConsumeTile(tile_offset, TILE_SAMPLES); - tile_offset += gridDim.x * TILE_SAMPLES; - } - } - } - - - //--------------------------------------------------------------------- - // Parameter extraction - //--------------------------------------------------------------------- - - // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) - template < - CacheLoadModifier _MODIFIER, - typename _ValueT, - typename _OffsetT> - __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) - { - return itr.ptr; - } - - // Return a native pixel pointer (specialized for other types) - template - __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) - { - return NULL; - } - - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - - /** - * Constructor - */ - __device__ __forceinline__ AgentHistogram( 
- TempStorage &temp_storage, ///< Reference to temp_storage - SampleIteratorT d_samples, ///< Input data to reduce - int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel - : - temp_storage(temp_storage.Alias()), - d_wrapped_samples(d_samples), - num_output_bins(num_output_bins), - num_privatized_bins(num_privatized_bins), - d_output_histograms(d_output_histograms), - privatized_decode_op(privatized_decode_op), - output_decode_op(output_decode_op), - d_native_samples(NativePointer(d_wrapped_samples)), - prefer_smem((MEM_PREFERENCE == SMEM) ? - true : // prefer smem privatized histograms - (MEM_PREFERENCE == GMEM) ? 
- false : // prefer gmem privatized histograms - blockIdx.x & 1) // prefer blended privatized histograms - { - int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; - - // Initialize the locations of this block's privatized histograms - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); - } - - - /** - * Consume image - */ - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks - { - // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) - int quad_mask = AlignBytes::ALIGN_BYTES - 1; - int pixel_mask = AlignBytes::ALIGN_BYTES - 1; - size_t row_bytes = sizeof(SampleT) * row_stride_samples; - - bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel - ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned - ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad - - bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel - ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned - ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel - - // Whether rows are aligned and can be vectorized - if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) - ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); - 
else - ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); - } - - - /** - * Initialize privatized bin counters. Specialized for privatized shared-memory counters - */ - __device__ __forceinline__ void InitBinCounters() - { - if (prefer_smem) - InitSmemBinCounters(); - else - InitGmemBinCounters(); - } - - - /** - * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters - */ - __device__ __forceinline__ void StoreOutput() - { - if (prefer_smem) - StoreSmemOutput(); - else - StoreGmemOutput(); - } - - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh deleted file mode 100644 index 0eee5f4..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_downsweep.cuh +++ /dev/null @@ -1,772 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
- */ - - -#pragma once - -#include - -#include "../thread/thread_load.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_radix_rank.cuh" -#include "../block/block_exchange.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Radix ranking algorithm - */ -enum RadixRankAlgorithm -{ - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH -}; - -/** - * Parameterizable tuning policy type for AgentRadixSortDownsweep - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) - RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct AgentRadixSortDownsweepPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) - static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use - static const 
BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - - - - - -/** - * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . - */ -template < - typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< KeyT type - typename ValueT, ///< ValueT type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRadixSortDownsweep -{ - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - // Appropriate unsigned-bits representation of KeyT - typedef typename Traits::UnsignedBits UnsignedBits; - - static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; - static const UnsignedBits MAX_KEY = Traits::MAX_KEY; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; - static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; - static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; - static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; - - enum - { - BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, - RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - KEYS_ONLY = Equals::VALUE, - }; - - // Input iterator wrapper type (for applying 
cache modifier)s - typedef CacheModifiedInputIterator KeysItr; - typedef CacheModifiedInputIterator ValuesItr; - - // Radix ranking type to use - typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), - BlockRadixRank, - typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), - BlockRadixRank, - BlockRadixRankMatch - >::Type - >::Type BlockRadixRankT; - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD - }; - - // BlockLoad type (keys) - typedef BlockLoad< - UnsignedBits, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM> BlockLoadKeysT; - - // BlockLoad type (values) - typedef BlockLoad< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM> BlockLoadValuesT; - - // Value exchange array type - typedef ValueT ValueExchangeT[TILE_ITEMS]; - - /** - * Shared memory storage layout - */ - union __align__(16) _TempStorage - { - typename BlockLoadKeysT::TempStorage load_keys; - typename BlockLoadValuesT::TempStorage load_values; - typename BlockRadixRankT::TempStorage radix_rank; - - struct - { - UnsignedBits exchange_keys[TILE_ITEMS]; - OffsetT relative_bin_offsets[RADIX_DIGITS]; - }; - - Uninitialized exchange_values; - - OffsetT exclusive_digit_prefix[RADIX_DIGITS]; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Input and output device pointers - KeysItr d_keys_in; - ValuesItr d_values_in; - UnsignedBits *d_keys_out; - ValueT *d_values_out; - - // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) - OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; - - // The least-significant bit position of the current digit to extract - int 
current_bit; - - // Number of bits in current digit - int num_bits; - - // Whether to short-cirucit - int short_circuit; - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - - /** - * Scatter ranked keys through shared memory, then to device-accessible memory - */ - template - __device__ __forceinline__ void ScatterKeys( - UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT valid_items) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; - UnsignedBits digit = BFE(key, current_bit, num_bits); - relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; - - // Un-twiddle - key = Traits::TwiddleOut(key); - - if (FULL_TILE || - (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) - { - d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; - } - } - } - - - /** - * Scatter ranked values through shared memory, then to device-accessible memory - */ - template - __device__ __forceinline__ void ScatterValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT valid_items) - { - CTA_SYNC(); - - ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - exchange_values[ranks[ITEM]] = values[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; 
- - if (FULL_TILE || - (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) - { - d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; - } - } - } - - /** - * Load a tile of keys (specialized for full tile, any ranking algorithm) - */ - template - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadKeysT(temp_storage.load_keys).Load( - d_keys_in + block_offset, keys); - - CTA_SYNC(); - } - - - /** - * Load a tile of keys (specialized for partial tile, any ranking algorithm) - */ - template - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadKeysT(temp_storage.load_keys).Load( - d_keys_in + block_offset, keys, valid_items, oob_item); - - CTA_SYNC(); - } - - - /** - * Load a tile of keys (specialized for full tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); - } - - - /** - * Load a tile of keys (specialized for partial tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); - } - - - /** - * Load a tile of values (specialized for full tile, any ranking algorithm) - */ - template - __device__ 
__forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadValuesT(temp_storage.load_values).Load( - d_values_in + block_offset, values); - - CTA_SYNC(); - } - - - /** - * Load a tile of values (specialized for partial tile, any ranking algorithm) - */ - template - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadValuesT(temp_storage.load_values).Load( - d_values_in + block_offset, values, valid_items); - - CTA_SYNC(); - } - - - /** - * Load a tile of items (specialized for full tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - volatile OffsetT valid_items, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); - } - - - /** - * Load a tile of items (specialized for partial tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - volatile OffsetT valid_items, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); - } - - - /** - * Truck along associated values - */ - template - __device__ __forceinline__ void GatherScatterValues( - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type /*is_keys_only*/) - { - CTA_SYNC(); - - ValueT values[ITEMS_PER_THREAD]; - - LoadValues( - values, - block_offset, - valid_items, - Int2Type(), - Int2Type()); - - ScatterValues( - values, - relative_bin_offsets, - ranks, - valid_items); - } - - - /** - * 
Truck along associated values (specialized for key-only sorting) - */ - template - __device__ __forceinline__ void GatherScatterValues( - OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], - int (&/*ranks*/)[ITEMS_PER_THREAD], - OffsetT /*block_offset*/, - OffsetT /*valid_items*/, - Int2Type /*is_keys_only*/) - {} - - - /** - * Process tile - */ - template - __device__ __forceinline__ void ProcessTile( - OffsetT block_offset, - const OffsetT &valid_items = TILE_ITEMS) - { - UnsignedBits keys[ITEMS_PER_THREAD]; - int ranks[ITEMS_PER_THREAD]; - OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; - - // Assign default (min/max) value to all keys - UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; - - // Load tile of keys - LoadKeys( - keys, - block_offset, - valid_items, - default_key, - Int2Type(), - Int2Type()); - - // Twiddle key bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - keys[KEY] = Traits::TwiddleIn(keys[KEY]); - } - - // Rank the twiddled keys - int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; - BlockRadixRankT(temp_storage.radix_rank).RankKeys( - keys, - ranks, - current_bit, - num_bits, - exclusive_digit_prefix); - - CTA_SYNC(); - - // Share exclusive digit prefix - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - // Store exclusive prefix - temp_storage.exclusive_digit_prefix[bin_idx] = - exclusive_digit_prefix[track]; - } - } - - CTA_SYNC(); - - // Get inclusive digit prefix - int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - { - // Get inclusive digit prefix from exclusive 
prefix (higher bins come first) - inclusive_digit_prefix[track] = (bin_idx == 0) ? - (BLOCK_THREADS * ITEMS_PER_THREAD) : - temp_storage.exclusive_digit_prefix[bin_idx - 1]; - } - else - { - // Get inclusive digit prefix from exclusive prefix (lower bins come first) - inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? - (BLOCK_THREADS * ITEMS_PER_THREAD) : - temp_storage.exclusive_digit_prefix[bin_idx + 1]; - } - } - } - - CTA_SYNC(); - - // Update global scatter base offsets for each digit - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - bin_offset[track] -= exclusive_digit_prefix[track]; - temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; - bin_offset[track] += inclusive_digit_prefix[track]; - } - } - - CTA_SYNC(); - - // Scatter keys - ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); - - // Gather/scatter values - GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); - } - - //--------------------------------------------------------------------- - // Copy shortcut - //--------------------------------------------------------------------- - - /** - * Copy tiles within the range of input - */ - template < - typename InputIteratorT, - typename T> - __device__ __forceinline__ void Copy( - InputIteratorT d_in, - T *d_out, - OffsetT block_offset, - OffsetT block_end) - { - // Simply copy the input - while (block_offset + TILE_ITEMS <= block_end) - { - T items[ITEMS_PER_THREAD]; - - LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); - StoreDirectStriped(threadIdx.x, d_out + block_offset, items); - - block_offset += TILE_ITEMS; - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_end) - { - OffsetT valid_items = block_end - block_offset; - - T items[ITEMS_PER_THREAD]; - - 
LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); - CTA_SYNC(); - StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); - } - } - - - /** - * Copy tiles within the range of input (specialized for NullType) - */ - template - __device__ __forceinline__ void Copy( - InputIteratorT /*d_in*/, - NullType * /*d_out*/, - OffsetT /*block_offset*/, - OffsetT /*block_end*/) - {} - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortDownsweep( - TempStorage &temp_storage, - OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], - OffsetT num_items, - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - int current_bit, - int num_bits) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - d_values_in(d_values_in), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), - short_circuit(1) - { - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - this->bin_offset[track] = bin_offset[track]; - - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - // Short circuit if the histogram has only bin counts of only zeros or problem-size - short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); - } - } - - short_circuit = CTA_SYNC_AND(short_circuit); - } - - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortDownsweep( - TempStorage &temp_storage, - OffsetT num_items, - OffsetT *d_spine, - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - int current_bit, - int num_bits) - : - 
temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - d_values_in(d_values_in), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), - short_circuit(1) - { - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size - OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; - short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); - - // Load my block's bin offset for my bin - bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; - } - } - - short_circuit = CTA_SYNC_AND(short_circuit); - } - - - /** - * Distribute keys from a segment of input tiles. 
- */ - __device__ __forceinline__ void ProcessRegion( - OffsetT block_offset, - OffsetT block_end) - { - if (short_circuit) - { - // Copy keys - Copy(d_keys_in, d_keys_out, block_offset, block_end); - - // Copy values - Copy(d_values_in, d_values_out, block_offset, block_end); - } - else - { - // Process full tiles of tile_items - while (block_offset + TILE_ITEMS <= block_end) - { - ProcessTile(block_offset); - block_offset += TILE_ITEMS; - - CTA_SYNC(); - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_end) - { - ProcessTile(block_offset, block_end - block_offset); - } - - } - } - -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh deleted file mode 100644 index 803fadf..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_radix_sort_upsweep.cuh +++ /dev/null @@ -1,526 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
- */ - -#pragma once - -#include "../thread/thread_reduce.cuh" -#include "../thread/thread_load.cuh" -#include "../warp/warp_reduce.cuh" -#include "../block/block_load.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentRadixSortUpsweep - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct AgentRadixSortUpsweepPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
- */ -template < - typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type - typename KeyT, ///< KeyT type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRadixSortUpsweep -{ - - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - typedef typename Traits::UnsignedBits UnsignedBits; - - // Integer type for digit counters (to be packed into words of PackedCounters) - typedef unsigned char DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef unsigned int PackedCounter; - - static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; - - enum - { - RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, - BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, - KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // To prevent counter overflow, we must periodically unpack and aggregate the - // digit counters back into registers. Each counter lane is assigned to a - // warp for aggregation. 
- - LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), - - // Unroll tiles in batches without risk of counter overflow - UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), - UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, - }; - - - // Input iterator wrapper type (for applying cache modifier)s - typedef CacheModifiedInputIterator KeysItr; - - /** - * Shared memory storage layout - */ - union __align__(16) _TempStorage - { - DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; - OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields (aggregate state bundle) - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Thread-local counters for periodically aggregating composite-counter lanes - OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; - - // Input and output device pointers - KeysItr d_keys_in; - - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; - - - - //--------------------------------------------------------------------- - // Helper structure for templated iteration - //--------------------------------------------------------------------- - - // Iterate - template - struct Iterate - { - // BucketKeys - static __device__ __forceinline__ void BucketKeys( - AgentRadixSortUpsweep &cta, - UnsignedBits keys[KEYS_PER_THREAD]) - { - cta.Bucket(keys[COUNT]); - - // Next - Iterate::BucketKeys(cta, keys); - } - }; - - // Terminate - template - struct Iterate - { - // BucketKeys - static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits 
/*keys*/[KEYS_PER_THREAD]) {} - }; - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /** - * Decode a key and increment corresponding smem digit counter - */ - __device__ __forceinline__ void Bucket(UnsignedBits key) - { - // Perform transform op - UnsignedBits converted_key = Traits::TwiddleIn(key); - - // Extract current digit bits - UnsignedBits digit = BFE(converted_key, current_bit, num_bits); - - // Get sub-counter offset - UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); - - // Get row offset - UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; - - // Increment counter - temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; - } - - - /** - * Reset composite counters - */ - __device__ __forceinline__ void ResetDigitCounters() - { - #pragma unroll - for (int LANE = 0; LANE < COUNTER_LANES; LANE++) - { - temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; - } - } - - - /** - * Reset the unpacked counters in each thread - */ - __device__ __forceinline__ void ResetUnpackedCounters() - { - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - local_counts[LANE][UNPACKED_COUNTER] = 0; - } - } - } - - - /** - * Extracts and aggregates the digit counters for each counter lane - * owned by this warp - */ - __device__ __forceinline__ void UnpackDigitCounts() - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - const int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - #pragma unroll - for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; 
UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; - local_counts[LANE][UNPACKED_COUNTER] += counter; - } - } - } - } - } - - - /** - * Processes a single, full tile - */ - __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) - { - // Tile of keys - UnsignedBits keys[KEYS_PER_THREAD]; - - LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); - - // Prevent hoisting - CTA_SYNC(); - - // Bucket tile of keys - Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); - } - - - /** - * Processes a single load (may have some threads masked off) - */ - __device__ __forceinline__ void ProcessPartialTile( - OffsetT block_offset, - const OffsetT &block_end) - { - // Process partial tile if necessary using single loads - block_offset += threadIdx.x; - while (block_offset < block_end) - { - // Load and bucket key - UnsignedBits key = d_keys_in[block_offset]; - Bucket(key); - block_offset += BLOCK_THREADS; - } - } - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortUpsweep( - TempStorage &temp_storage, - const KeyT *d_keys_in, - int current_bit, - int num_bits) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit), - num_bits(num_bits) - {} - - - /** - * Compute radix digit histograms from a segment of input tiles. 
- */ - __device__ __forceinline__ void ProcessRegion( - OffsetT block_offset, - const OffsetT &block_end) - { - // Reset digit counters in smem and unpacked counters in registers - ResetDigitCounters(); - ResetUnpackedCounters(); - - // Unroll batches of full tiles - while (block_offset + UNROLLED_ELEMENTS <= block_end) - { - for (int i = 0; i < UNROLL_COUNT; ++i) - { - ProcessFullTile(block_offset); - block_offset += TILE_ITEMS; - } - - CTA_SYNC(); - - // Aggregate back into local_count registers to prevent overflow - UnpackDigitCounts(); - - CTA_SYNC(); - - // Reset composite counters in lanes - ResetDigitCounters(); - } - - // Unroll single full tiles - while (block_offset + TILE_ITEMS <= block_end) - { - ProcessFullTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Process partial tile if necessary - ProcessPartialTile( - block_offset, - block_end); - - CTA_SYNC(); - - // Aggregate back into local_count registers - UnpackDigitCounts(); - } - - - /** - * Extract counts (saving them to the external array) - */ - template - __device__ __forceinline__ void ExtractCounts( - OffsetT *counters, - int bin_stride = 1, - int bin_offset = 0) - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - // Place unpacked digit counters in shared memory - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - int digit_row = counter_lane << LOG_PACKING_RATIO; - - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - int bin_idx = digit_row + UNPACKED_COUNTER; - - temp_storage.block_counters[warp_tid][bin_idx] = - local_counts[LANE][UNPACKED_COUNTER]; - } - } - } - - CTA_SYNC(); - - // Rake-reduce bin_count reductions - - // Whole blocks - #pragma unroll - for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; - (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; - BIN_BASE += 
BLOCK_THREADS) - { - int bin_idx = BIN_BASE + threadIdx.x; - - OffsetT bin_count = 0; - #pragma unroll - for (int i = 0; i < WARP_THREADS; ++i) - bin_count += temp_storage.block_counters[i][bin_idx]; - - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - counters[(bin_stride * bin_idx) + bin_offset] = bin_count; - } - - // Remainder - if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) - { - int bin_idx = threadIdx.x; - - OffsetT bin_count = 0; - #pragma unroll - for (int i = 0; i < WARP_THREADS; ++i) - bin_count += temp_storage.block_counters[i][bin_idx]; - - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - counters[(bin_stride * bin_idx) + bin_offset] = bin_count; - } - } - - - /** - * Extract counts - */ - template - __device__ __forceinline__ void ExtractCounts( - OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - // Place unpacked digit counters in shared memory - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - int digit_row = counter_lane << LOG_PACKING_RATIO; - - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - int bin_idx = digit_row + UNPACKED_COUNTER; - - temp_storage.block_counters[warp_tid][bin_idx] = - local_counts[LANE][UNPACKED_COUNTER]; - } - } - } - - CTA_SYNC(); - - // Rake-reduce bin_count reductions - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - bin_count[track] = 0; - - #pragma unroll - for (int 
i = 0; i < WARP_THREADS; ++i) - bin_count[track] += temp_storage.block_counters[i][bin_idx]; - } - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh deleted file mode 100644 index 5528d8b..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce.cuh +++ /dev/null @@ -1,385 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . - */ - -#pragma once - -#include - -#include "../block/block_load.cuh" -#include "../block/block_reduce.cuh" -#include "../grid/grid_mapping.cuh" -#include "../grid/grid_even_share.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentReduce - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use - CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements -struct AgentReducePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - VECTOR_LOAD_LENGTH = 
_VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, this - * partial reduction is stored into \p thread_aggregate. Otherwise it is - * accumulated into \p thread_aggregate. - */ -template < - typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type - typename InputIteratorT, ///< Random-access iterator type for input - typename OutputIteratorT, ///< Random-access iterator type for output - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -struct AgentReduce -{ - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - typedef typename std::iterator_traits::value_type InputT; - - /// The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type - - /// Vector type of InputT for data movement - typedef typename CubVector::Type VectorT; - - /// Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - /// Constants - enum - { - BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type - ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && - (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && - (IsPointer::VALUE) && Traits::PRIMITIVE, - - }; - - static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; - static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; - - /// Parameterized BlockReduce primitive - typedef BlockReduce BlockReduceT; - - /// Shared memory type required by this thread block - struct _TempStorage - { - typename BlockReduceT::TempStorage reduce; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - InputIteratorT d_in; ///< Input data to reduce - WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce - ReductionOp reduction_op; ///< Binary reduction operator - - - //--------------------------------------------------------------------- - // Utility - 
//--------------------------------------------------------------------- - - - // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator d_in, - Int2Type /*can_vectorize*/) - { - return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; - } - - // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator /*d_in*/, - Int2Type /*can_vectorize*/) - { - return false; - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentReduce( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data to reduce - ReductionOp reduction_op) ///< Binary reduction operator - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_wrapped_in(d_in), - reduction_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Tile consumption - //--------------------------------------------------------------------- - - /** - * Consume a full tile of input (non-vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - OutputT items[ITEMS_PER_THREAD]; - - // Load items in striped fashion - LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? 
- internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - - - /** - * Consume a full tile of input (vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Alias items as an array of VectorT and load it in striped fashion - enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - - // Fabricate a vectorized input iterator - InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); - CacheModifiedInputIterator d_vec_in( - reinterpret_cast(d_in_unqualified)); - - // Load items as vector items - InputT input_items[ITEMS_PER_THREAD]; - VectorT *vec_items = reinterpret_cast(input_items); - #pragma unroll - for (int i = 0; i < WORDS; ++i) - vec_items[i] = d_vec_in[BLOCK_THREADS * i]; - - // Convert from input type to output type - OutputT items[ITEMS_PER_THREAD]; - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - items[i] = input_items[i]; - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? 
- internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - - - /** - * Consume a partial tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int valid_items, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Partial tile - int thread_offset = threadIdx.x; - - // Read first item - if ((IS_FIRST_TILE) && (thread_offset < valid_items)) - { - thread_aggregate = d_wrapped_in[block_offset + thread_offset]; - thread_offset += BLOCK_THREADS; - } - - // Continue reading items (block-striped) - while (thread_offset < valid_items) - { - OutputT item = d_wrapped_in[block_offset + thread_offset]; - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += BLOCK_THREADS; - } - } - - - //--------------------------------------------------------------- - // Consume a contiguous segment of tiles - //--------------------------------------------------------------------- - - /** - * \brief Reduce a contiguous segment of input tiles - */ - template - __device__ __forceinline__ OutputT ConsumeRange( - GridEvenShare &even_share, ///< GridEvenShare descriptor - Int2Type can_vectorize) ///< Whether or not we can vectorize loads - { - OutputT thread_aggregate; - - if (even_share.block_offset + TILE_ITEMS > even_share.block_end) - { - // First tile isn't full (not all threads have valid items) - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); - } - - // At least one full block - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, 
Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - - // Consume subsequent full tiles of input - while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) - { - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - } - - // Consume a partially-full tile - if (even_share.block_offset < even_share.block_end) - { - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - } - - // Compute block-wide reduction (all threads have valid items) - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); - } - - - /** - * \brief Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeRange( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) - { - GridEvenShare even_share; - even_share.template BlockInit(block_offset, block_end); - - return (IsAligned(d_in + block_offset, Int2Type())) ? - ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - } - - - /** - * Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeTiles( - GridEvenShare &even_share) ///< [in] GridEvenShare descriptor - { - // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block - even_share.template BlockInit(); - - return (IsAligned(d_in, Int2Type())) ? 
- ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh deleted file mode 100644 index a57d60e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_reduce_by_key.cuh +++ /dev/null @@ -1,549 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. - */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_discontinuity.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentReduceByKey - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentReduceByKeyPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = 
_ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key - */ -template < - typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentReduceByKey -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // The input keys type - typedef typename std::iterator_traits::value_type KeyInputT; - - // The output keys type - typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, - typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type - - // The input values type - typedef typename std::iterator_traits::value_type ValueInputT; - - // The output values type - typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair OffsetValuePairT; - - // Tuple type for pairing keys and values - typedef KeyValuePair KeyValuePairT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Guarded inequality functor - template - struct GuardedInequalityWrapper - { - _EqualityOpT op; ///< Wrapped equality operator - int num_remaining; ///< Items remaining - - /// Constructor - __host__ __device__ __forceinline__ - GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} - - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const - { - if (idx < num_remaining) - return !op(a, b); // In bounds - - // Return true if first out-of-bounds item, false otherwise - return (idx == num_remaining); - } - }; - - - // Constants - enum - { - BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), - - // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) - HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), 
- }; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - KeysInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedKeysInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - ValuesInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedValuesInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type - WrappedFixupInputIteratorT; - - // Reduce-value-by-segment scan operator - typedef ReduceBySegmentOp ReduceBySegmentOpT; - - // Parameterized BlockLoad type for keys - typedef BlockLoad< - KeyOutputT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentReduceByKeyPolicyT::LOAD_ALGORITHM> - BlockLoadKeysT; - - // Parameterized BlockLoad type for values - typedef BlockLoad< - ValueOutputT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentReduceByKeyPolicyT::LOAD_ALGORITHM> - BlockLoadValuesT; - - // Parameterized BlockDiscontinuity type for keys - typedef BlockDiscontinuity< - KeyOutputT, - BLOCK_THREADS> - BlockDiscontinuityKeys; - - // Parameterized BlockScan type - typedef BlockScan< - OffsetValuePairT, - BLOCK_THREADS, - AgentReduceByKeyPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - OffsetValuePairT, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Key and value exchange types - typedef KeyOutputT 
KeyExchangeT[TILE_ITEMS + 1]; - typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; - - // Shared memory type for this thread block - union _TempStorage - { - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection - }; - - // Smem needed for loading keys - typename BlockLoadKeysT::TempStorage load_keys; - - // Smem needed for loading values - typename BlockLoadValuesT::TempStorage load_values; - - // Smem needed for compacting key value pairs(allows non POD items in this union) - Uninitialized raw_exchange; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedKeysInputIteratorT d_keys_in; ///< Input keys - UniqueOutputIteratorT d_unique_out; ///< Unique output keys - WrappedValuesInputIteratorT d_values_in; ///< Input values - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified - EqualityOpT equality_op; ///< KeyT equality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentReduceByKey( - TempStorage& temp_storage, ///< Reference to temp_storage - KeysInputIteratorT d_keys_in, ///< Input keys - UniqueOutputIteratorT 
d_unique_out, ///< Unique output keys - ValuesInputIteratorT d_values_in, ///< Input values - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_keys_in(d_keys_in), - d_unique_out(d_unique_out), - d_values_in(d_values_in), - d_aggregates_out(d_aggregates_out), - d_num_runs_out(d_num_runs_out), - equality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Scatter utility methods - //--------------------------------------------------------------------- - - /** - * Directly scatter flagged items to output offsets - */ - __device__ __forceinline__ void ScatterDirect( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD]) - { - // Scatter flagged keys and values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (segment_flags[ITEM]) - { - d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; - d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; - } - } - } - - - /** - * 2-phase scatter flagged items to output offsets - * - * The exclusive scan causes each head flag to be paired with the previous - * value aggregate: the scatter offsets must be decremented for value aggregates - */ - __device__ __forceinline__ void ScatterTwoPhase( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD], - OffsetT num_tile_segments, - OffsetT num_tile_segments_prefix) - { - CTA_SYNC(); - - // Compact and scatter pairs - #pragma unroll - for (int ITEM = 0; ITEM < 
ITEMS_PER_THREAD; ++ITEM) - { - if (segment_flags[ITEM]) - { - temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; - } - } - - CTA_SYNC(); - - for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) - { - KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; - d_unique_out[num_tile_segments_prefix + item] = pair.key; - d_aggregates_out[num_tile_segments_prefix + item] = pair.value; - } - } - - - /** - * Scatter flagged items - */ - __device__ __forceinline__ void Scatter( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD], - OffsetT num_tile_segments, - OffsetT num_tile_segments_prefix) - { - // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one - if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) - { - ScatterTwoPhase( - scatter_items, - segment_flags, - segment_indices, - num_tile_segments, - num_tile_segments_prefix); - } - else - { - ScatterDirect( - scatter_items, - segment_flags, - segment_indices); - } - } - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template ///< Whether the current tile is the last tile - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys - KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up - ValueOutputT values[ITEMS_PER_THREAD]; // Tile values - OffsetT 
head_flags[ITEMS_PER_THREAD]; // Segment head flags - OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices - OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices - KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering - - // Load keys - if (IS_LAST_TILE) - BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); - else - BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); - - // Load tile predecessor key in first thread - KeyOutputT tile_predecessor; - if (threadIdx.x == 0) - { - tile_predecessor = (tile_idx == 0) ? - keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) - d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile - } - - CTA_SYNC(); - - // Load values - if (IS_LAST_TILE) - BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); - else - BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); - - CTA_SYNC(); - - // Initialize head-flags and shuffle up the previous keys - if (IS_LAST_TILE) - { - // Use custom flag operator to additionally flag the first out-of-bounds item - GuardedInequalityWrapper flag_op(equality_op, num_remaining); - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( - head_flags, keys, prev_keys, flag_op, tile_predecessor); - } - else - { - InequalityWrapper flag_op(equality_op); - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( - head_flags, keys, prev_keys, flag_op, tile_predecessor); - } - - // Zip values and head flags - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scan_items[ITEM].value = values[ITEM]; - scan_items[ITEM].key = head_flags[ITEM]; - } - - // Perform exclusive tile scan - OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate - OffsetT num_segments_prefix; // Number of 
segments prior to this tile - ValueOutputT total_aggregate; // The tile prefix folded with block_aggregate - if (tile_idx == 0) - { - // Scan first tile - BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); - num_segments_prefix = 0; - total_aggregate = block_aggregate.value; - - // Update tile status if there are successor tiles - if ((!IS_LAST_TILE) && (threadIdx.x == 0)) - tile_state.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); - BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); - - block_aggregate = prefix_op.GetBlockAggregate(); - num_segments_prefix = prefix_op.GetExclusivePrefix().key; - total_aggregate = reduction_op( - prefix_op.GetExclusivePrefix().value, - block_aggregate.value); - } - - // Rezip scatter items and segment indices - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scatter_items[ITEM].key = prev_keys[ITEM]; - scatter_items[ITEM].value = scan_items[ITEM].value; - segment_indices[ITEM] = scan_items[ITEM].key; - } - - // At this point, each flagged segment head has: - // - The key for the previous segment - // - The reduced value from the previous segment - // - The segment index for the reduced value - - // Scatter flagged keys and values - OffsetT num_tile_segments = block_aggregate.key; - Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); - - // Last thread in last tile will output final count (and last pair, if necessary) - if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) - { - OffsetT num_segments = num_segments_prefix + num_tile_segments; - - // If the last tile is a whole tile, output the final_value - if (num_remaining == TILE_ITEMS) - { - d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; - d_aggregates_out[num_segments] = total_aggregate; - num_segments++; 
- } - - // Output the total number of items selected - *d_num_runs_out = num_segments; - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - ScanTileStateT& tile_state, ///< Global tile state descriptor - int start_tile) ///< The starting tile for the current grid - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = start_tile + blockIdx.x; // Current tile index - OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - else if (num_remaining > 0) - { - // Last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh deleted file mode 100644 index 0ba9216..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_rle.cuh +++ /dev/null @@ -1,837 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
- */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../block/block_discontinuity.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentRle - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentRlePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = 
_LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode - */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for data - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values - typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRle -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - typedef typename std::iterator_traits::value_type T; - - /// The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type - - /// Tuple type for scanning (pairs run-length and run-index) - typedef KeyValuePair LengthOffsetPair; - - /// Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Constants - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, - WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// Whether or not to sync after loading data - SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, - ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, - }; - - - /** - * Special operator that signals all out-of-bounds items are not equal to everything else, - * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked - * trivial. 
- */ - template - struct OobInequalityOp - { - OffsetT num_remaining; - EqualityOpT equality_op; - - __device__ __forceinline__ OobInequalityOp( - OffsetT num_remaining, - EqualityOpT equality_op) - : - num_remaining(num_remaining), - equality_op(equality_op) - {} - - template - __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) - { - if (!LAST_TILE || (idx < num_remaining)) - return !equality_op(first, second); - else - return true; - } - }; - - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for data - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - // Parameterized BlockLoad type for data - typedef BlockLoad< - T, - AgentRlePolicyT::BLOCK_THREADS, - AgentRlePolicyT::ITEMS_PER_THREAD, - AgentRlePolicyT::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockDiscontinuity type for data - typedef BlockDiscontinuity BlockDiscontinuityT; - - // Parameterized WarpScan type - typedef WarpScan WarpScanPairs; - - // Reduce-length-by-run scan operator - typedef ReduceBySegmentOp ReduceBySegmentOpT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - LengthOffsetPair, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Warp exchange types - typedef WarpExchange WarpExchangePairs; - - typedef typename If::Type WarpExchangePairsStorage; - - typedef WarpExchange WarpExchangeOffsets; - typedef WarpExchange WarpExchangeLengths; - - typedef LengthOffsetPair WarpAggregates[WARPS]; - - // Shared memory type for this thread block - struct _TempStorage - { - // Aliasable storage layout - union Aliasable - { - struct - { - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - typename WarpScanPairs::TempStorage warp_scan[WARPS]; 
// Smem needed for warp-synchronous scans - Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - }; - - // Smem needed for input loading - typename BlockLoadT::TempStorage load; - - // Aliasable layout needed for two-phase scatter - union ScatterAliasable - { - unsigned long long align; - WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; - typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; - typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; - - } scatter_aliasable; - - } aliasable; - - OffsetT tile_idx; // Shared tile index - LengthOffsetPair tile_inclusive; // Inclusive tile prefix - LengthOffsetPair tile_exclusive; // Exclusive tile prefix - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - - WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets - LengthsOutputIteratorT d_lengths_out; ///< Output run lengths - - EqualityOpT equality_op; ///< T equality operator - ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator - OffsetT num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentRle( - TempStorage &temp_storage, ///< [in] Reference to temp_storage - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, 
///< [out] Pointer to output sequence of run offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths - EqualityOpT equality_op, ///< [in] T equality operator - OffsetT num_items) ///< [in] Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_offsets_out(d_offsets_out), - d_lengths_out(d_lengths_out), - equality_op(equality_op), - scan_op(cub::Sum()), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Utility methods for initializing the selections - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_remaining, - T (&items)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) - { - bool head_flags[ITEMS_PER_THREAD]; - bool tail_flags[ITEMS_PER_THREAD]; - - OobInequalityOp inequality_op(num_remaining, equality_op); - - if (FIRST_TILE && LAST_TILE) - { - // First-and-last-tile always head-flags the first item and tail-flags the last item - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tail_flags, items, inequality_op); - } - else if (FIRST_TILE) - { - // First-tile always head-flags the first item - - // Get the first item from the next tile - T tile_successor_item; - if (threadIdx.x == BLOCK_THREADS - 1) - tile_successor_item = d_in[tile_offset + TILE_ITEMS]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tail_flags, tile_successor_item, items, inequality_op); - } - else if (LAST_TILE) - { - // Last-tile always flags the last item - - // Get the last item from the previous tile - T tile_predecessor_item; - if (threadIdx.x == 0) - tile_predecessor_item = d_in[tile_offset - 1]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, 
tile_predecessor_item, tail_flags, items, inequality_op); - } - else - { - // Get the first item from the next tile - T tile_successor_item; - if (threadIdx.x == BLOCK_THREADS - 1) - tile_successor_item = d_in[tile_offset + TILE_ITEMS]; - - // Get the last item from the previous tile - T tile_predecessor_item; - if (threadIdx.x == 0) - tile_predecessor_item = d_in[tile_offset - 1]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); - } - - // Zip counts and runs - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); - lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); - } - } - - //--------------------------------------------------------------------- - // Scan utility methods - //--------------------------------------------------------------------- - - /** - * Scan of allocations - */ - __device__ __forceinline__ void WarpScanAllocations( - LengthOffsetPair &tile_aggregate, - LengthOffsetPair &warp_aggregate, - LengthOffsetPair &warp_exclusive_in_tile, - LengthOffsetPair &thread_exclusive_in_warp, - LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) - { - // Perform warpscans - unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - LengthOffsetPair identity; - identity.key = 0; - identity.value = 0; - - LengthOffsetPair thread_inclusive; - LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); - WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( - thread_aggregate, - thread_inclusive, - thread_exclusive_in_warp, - identity, - scan_op); - - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; - - CTA_SYNC(); - - // Accumulate total selected and the warp-wide prefix - warp_exclusive_in_tile = identity; - warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; - tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; - - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_exclusive_in_tile = tile_aggregate; - - tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); - } - } - - - //--------------------------------------------------------------------- - // Utility methods for scattering selections - //--------------------------------------------------------------------- - - /** - * Two-phase scatter, specialized for warp time-slicing - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], - Int2Type is_warp_time_slice) - { - unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - // Locally compact items within the warp (first warp) - if (warp_id == 0) - { - WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( - lengths_and_offsets, thread_num_runs_exclusive_in_warp); - } - - // Locally compact items within the warp (remaining warps) - #pragma unroll - for (int SLICE = 1; SLICE < WARPS; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( - lengths_and_offsets, thread_num_runs_exclusive_in_warp); - } - } - - // Global scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - (ITEM * WARP_THREADS) + lane_id; - - // Scatter offset - d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; - - // Scatter length if not the first (global) length - if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) - { - d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; - } - } - } - } - - - /** - * Two-phase scatter - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], - Int2Type is_warp_time_slice) - { - unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - // Unzip - OffsetT run_offsets[ITEMS_PER_THREAD]; - LengthT run_lengths[ITEMS_PER_THREAD]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - run_offsets[ITEM] = lengths_and_offsets[ITEM].key; - run_lengths[ITEM] = lengths_and_offsets[ITEM].value; - } - - WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( - run_offsets, thread_num_runs_exclusive_in_warp); - - WARP_SYNC(0xffffffff); - - WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( - run_lengths, thread_num_runs_exclusive_in_warp); - - // Global scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - (ITEM * WARP_THREADS) + lane_id; - - // Scatter offset - d_offsets_out[item_offset] = run_offsets[ITEM]; - - // Scatter length if not the first (global) length - if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) - { - d_lengths_out[item_offset - 1] = run_lengths[ITEM]; - } - } - } - } - - - /** - * Direct scatter - */ - template - __device__ __forceinline__ void ScatterDirect( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - thread_num_runs_exclusive_in_warp[ITEM]; - - // Scatter offset - d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; - - // Scatter length if 
not the first (global) length - if (item_offset >= 1) - { - d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; - } - } - } - } - - - /** - * Scatter - */ - template - __device__ __forceinline__ void Scatter( - OffsetT tile_num_runs_aggregate, - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) - { - if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) - { - // Direct scatter if the warp has any items - if (warp_num_runs_aggregate) - { - ScatterDirect( - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - } - } - else - { - // Scatter two phase - ScatterTwoPhase( - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets, - Int2Type()); - } - } - - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template < - bool LAST_TILE> - __device__ __forceinline__ LengthOffsetPair ConsumeTile( - OffsetT num_items, ///< Total number of global input items - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT &tile_status) ///< Global list of tile status - { - if (tile_idx == 0) - { - // First tile - - // Load items - T items[ITEMS_PER_THREAD]; - if (LAST_TILE) - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); - else - 
BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); - - if (SYNC_AFTER_LOAD) - CTA_SYNC(); - - // Set flags - LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; - - InitializeSelections( - tile_offset, - num_remaining, - items, - lengths_and_num_runs); - - // Exclusive scan of lengths and runs - LengthOffsetPair tile_aggregate; - LengthOffsetPair warp_aggregate; - LengthOffsetPair warp_exclusive_in_tile; - LengthOffsetPair thread_exclusive_in_warp; - - WarpScanAllocations( - tile_aggregate, - warp_aggregate, - warp_exclusive_in_tile, - thread_exclusive_in_warp, - lengths_and_num_runs); - - // Update tile status if this is not the last tile - if (!LAST_TILE && (threadIdx.x == 0)) - tile_status.SetInclusive(0, tile_aggregate); - - // Update thread_exclusive_in_warp to fold in warp run-length - if (thread_exclusive_in_warp.key == 0) - thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; - - LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; - OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; - LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; - - // Downsweep scan through lengths_and_num_runs - internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); - - // Zip - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; - lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
- lengths_and_num_runs2[ITEM].key : // keep - WARP_THREADS * ITEMS_PER_THREAD; // discard - } - - OffsetT tile_num_runs_aggregate = tile_aggregate.key; - OffsetT tile_num_runs_exclusive_in_global = 0; - OffsetT warp_num_runs_aggregate = warp_aggregate.key; - OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; - - // Scatter - Scatter( - tile_num_runs_aggregate, - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - - // Return running total (inclusive of this tile) - return tile_aggregate; - } - else - { - // Not first tile - - // Load items - T items[ITEMS_PER_THREAD]; - if (LAST_TILE) - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); - else - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); - - if (SYNC_AFTER_LOAD) - CTA_SYNC(); - - // Set flags - LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; - - InitializeSelections( - tile_offset, - num_remaining, - items, - lengths_and_num_runs); - - // Exclusive scan of lengths and runs - LengthOffsetPair tile_aggregate; - LengthOffsetPair warp_aggregate; - LengthOffsetPair warp_exclusive_in_tile; - LengthOffsetPair thread_exclusive_in_warp; - - WarpScanAllocations( - tile_aggregate, - warp_aggregate, - warp_exclusive_in_tile, - thread_exclusive_in_warp, - lengths_and_num_runs); - - // First warp computes tile prefix in lane 0 - TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); - unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - if (warp_id == 0) - { - prefix_op(tile_aggregate); - if (threadIdx.x == 0) - temp_storage.tile_exclusive = prefix_op.exclusive_prefix; - } - - CTA_SYNC(); - - LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; - - // Update thread_exclusive_in_warp to fold in warp and tile run-lengths - LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); - if (thread_exclusive_in_warp.key == 0) - thread_exclusive_in_warp.value += thread_exclusive.value; - - // Downsweep scan through lengths_and_num_runs - LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; - LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; - OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; - - internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); - - // Zip - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; - lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
- lengths_and_num_runs2[ITEM].key : // keep - WARP_THREADS * ITEMS_PER_THREAD; // discard - } - - OffsetT tile_num_runs_aggregate = tile_aggregate.key; - OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; - OffsetT warp_num_runs_aggregate = warp_aggregate.key; - OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; - - // Scatter - Scatter( - tile_num_runs_aggregate, - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - - // Return running total (inclusive of this tile) - return prefix_op.inclusive_prefix; - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_status, ///< Global list of tile status - NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (tile_idx < num_tiles - 1) - { - // Not the last tile (full) - ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); - } - else if (num_remaining > 0) - { - // The last tile (possibly partially-full) - LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); - - if (threadIdx.x == 0) - { - // Output the total number of items selected - *d_num_runs_out = running_total.key; - - // The inclusive prefix contains accumulated length reduction for the last run - if (running_total.key > 0) - 
d_lengths_out[running_total.key - 1] = running_total.value; - } - } - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh deleted file mode 100644 index 567df80..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_scan.cuh +++ /dev/null @@ -1,471 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . - */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentScan - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentScanPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< 
Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . - */ -template < - typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type - typename OutputIteratorT, ///< Random-access output iterator type - typename ScanOpT, ///< Scan functor type - typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) - typename OffsetT> ///< Signed integer type for global offsets -struct AgentScan -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - // Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - // Constants - enum - { - IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided - BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Parameterized BlockLoad type - typedef BlockLoad< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::ITEMS_PER_THREAD, - AgentScanPolicyT::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockStore type - typedef BlockStore< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::ITEMS_PER_THREAD, - AgentScanPolicyT::STORE_ALGORITHM> - BlockStoreT; - - // Parameterized BlockScan type - typedef BlockScan< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - OutputT, - ScanOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles - typedef BlockScanRunningPrefixOp< - OutputT, - ScanOpT> - RunningPrefixCallbackOp; - - // Shared memory type for this thread block - union _TempStorage - { - typename BlockLoadT::TempStorage load; // Smem needed for tile loading - typename BlockStoreT::TempStorage store; // Smem needed for tile storing - - struct - { - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; - 
}; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedInputIteratorT d_in; ///< Input data - OutputIteratorT d_out; ///< Output data - ScanOpT scan_op; ///< Binary scan operator - InitValueT init_value; ///< The init_value element for ScanOpT - - - //--------------------------------------------------------------------- - // Block scan utility methods - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization (first tile) - */ - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - OutputT init_value, - ScanOpT scan_op, - OutputT &block_aggregate, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); - block_aggregate = scan_op(init_value, block_aggregate); - } - - - /** - * Inclusive scan specialization (first tile) - */ - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - InitValueT /*init_value*/, - ScanOpT scan_op, - OutputT &block_aggregate, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); - } - - - /** - * Exclusive scan specialization (subsequent tiles) - */ - template - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - ScanOpT scan_op, - PrefixCallback &prefix_op, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); - } - - - /** - * Inclusive scan specialization (subsequent tiles) - */ - template - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - ScanOpT scan_op, - 
PrefixCallback &prefix_op, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentScan( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - OutputIteratorT d_out, ///< Output data - ScanOpT scan_op, ///< Binary scan operator - InitValueT init_value) ///< Initial value to seed the exclusive scan - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out(d_out), - scan_op(scan_op), - init_value(init_value) - {} - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template ///< Whether the current tile is the last tile - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - // Load items - OutputT items[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); - else - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); - - CTA_SYNC(); - - // Perform tile scan - if (tile_idx == 0) - { - // Scan first tile - OutputT block_aggregate; - ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); - if ((!IS_LAST_TILE) && (threadIdx.x == 0)) - tile_state.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, 
tile_idx); - ScanTile(items, scan_op, prefix_op, Int2Type()); - } - - CTA_SYNC(); - - // Store items - if (IS_LAST_TILE) - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); - else - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - ScanTileStateT& tile_state, ///< Global tile state descriptor - int start_tile) ///< The starting tile for the current grid - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = start_tile + blockIdx.x; // Current tile index - OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - else if (num_remaining > 0) - { - // Last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - } - - - //--------------------------------------------------------------------- - // Scan an sequence of consecutive tiles (independent of other thread blocks) - //--------------------------------------------------------------------- - - /** - * Process a tile of input - */ - template < - bool IS_FIRST_TILE, - bool IS_LAST_TILE> - __device__ __forceinline__ void ConsumeTile( - OffsetT tile_offset, ///< Tile offset - RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator - int valid_items = TILE_ITEMS) ///< Number of valid items in the tile - { - // Load items - OutputT items[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); - else - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); - - CTA_SYNC(); - - // Block scan - if 
(IS_FIRST_TILE) - { - OutputT block_aggregate; - ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); - prefix_op.running_total = block_aggregate; - } - else - { - ScanTile(items, scan_op, prefix_op, Int2Type()); - } - - CTA_SYNC(); - - // Store items - if (IS_LAST_TILE) - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); - else - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); - } - - - /** - * Scan a consecutive share of input tiles - */ - __device__ __forceinline__ void ConsumeRange( - OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT range_end) ///< [in] Threadblock end offset (exclusive) - { - BlockScanRunningPrefixOp prefix_op(scan_op); - - if (range_offset + TILE_ITEMS <= range_end) - { - // Consume first tile of input (full) - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; - - // Consume subsequent full tiles of input - while (range_offset + TILE_ITEMS <= range_end) - { - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (range_offset < range_end) - { - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - else - { - // Consume the first tile of input (partially-full) - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - - - /** - * Scan a consecutive share of input tiles, seeded with the specified prefix value - */ - __device__ __forceinline__ void ConsumeRange( - OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT range_end, ///< [in] Threadblock end offset (exclusive) - OutputT prefix) ///< [in] The prefix to apply to the scan segment - { - BlockScanRunningPrefixOp prefix_op(prefix, scan_op); - - // Consume full tiles of input - while (range_offset + TILE_ITEMS <= range_end) - { - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; 
- } - - // Consume a partially-full tile - if (range_offset < range_end) - { - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh deleted file mode 100644 index efa6d86..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_segment_fixup.cuh +++ /dev/null @@ -1,375 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. - */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_discontinuity.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentSegmentFixup - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentSegmentFixupPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = 
_ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key - */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentSegmentFixup -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of key-value input iterator - typedef typename std::iterator_traits::value_type KeyValuePairT; - - // Value type - typedef typename KeyValuePairT::Value ValueT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Constants - enum - { - BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Whether or not do fixup using RLE + global atomics - USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && - 
(Equals::VALUE || - Equals::VALUE || - Equals::VALUE || - Equals::VALUE), - - // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) - HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), - }; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - PairsInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedPairsInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type - WrappedFixupInputIteratorT; - - // Reduce-value-by-segment scan operator - typedef ReduceByKeyOp ReduceBySegmentOpT; - - // Parameterized BlockLoad type for pairs - typedef BlockLoad< - KeyValuePairT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentSegmentFixupPolicyT::LOAD_ALGORITHM> - BlockLoadPairs; - - // Parameterized BlockScan type - typedef BlockScan< - KeyValuePairT, - BLOCK_THREADS, - AgentSegmentFixupPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - KeyValuePairT, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Shared memory type for this thread block - union _TempStorage - { - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - }; - - // Smem needed for loading keys - typename BlockLoadPairs::TempStorage load_pairs; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : 
Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedPairsInputIteratorT d_pairs_in; ///< Input keys - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values - InequalityWrapper inequality_op; ///< KeyT inequality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentSegmentFixup( - TempStorage& temp_storage, ///< Reference to temp_storage - PairsInputIteratorT d_pairs_in, ///< Input keys - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_pairs_in(d_pairs_in), - d_aggregates_out(d_aggregates_out), - d_fixup_in(d_aggregates_out), - inequality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - - /** - * Process input tile. 
Specialized for atomic-fixup - */ - template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) - { - KeyValuePairT pairs[ITEMS_PER_THREAD]; - - // Load pairs - KeyValuePairT oob_pair; - oob_pair.key = -1; - - if (IS_LAST_TILE) - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); - else - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); - - // RLE - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; - if (pairs[ITEM].key != pairs[ITEM - 1].key) - atomicAdd(d_scatter, pairs[ITEM - 1].value); - else - pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); - } - - // Flush last item if valid - ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; - if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) - atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); - } - - - /** - * Process input tile. 
Specialized for reduce-by-key fixup - */ - template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) - { - KeyValuePairT pairs[ITEMS_PER_THREAD]; - KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; - - // Load pairs - KeyValuePairT oob_pair; - oob_pair.key = -1; - - if (IS_LAST_TILE) - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); - else - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); - - CTA_SYNC(); - - KeyValuePairT tile_aggregate; - if (tile_idx == 0) - { - // Exclusive scan of values and segment_flags - BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); - - // Update tile status if this is not the last tile - if (threadIdx.x == 0) - { - // Set first segment id to not trigger a flush (invalid from exclusive scan) - scatter_pairs[0].key = pairs[0].key; - - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, tile_aggregate); - - } - } - else - { - // Exclusive scan of values and segment_flags - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); - BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); - tile_aggregate = prefix_op.GetBlockAggregate(); - } - - // Scatter updated values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scatter_pairs[ITEM].key != pairs[ITEM].key) - { - // Update the value at the key location - ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; - value = reduction_op(value, scatter_pairs[ITEM].value); - - d_aggregates_out[scatter_pairs[ITEM].key] = value; - } - } - - // Finalize the last item - if 
(IS_LAST_TILE) - { - // Last thread will output final count and last item, if necessary - if (threadIdx.x == BLOCK_THREADS - 1) - { - // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment - if (num_remaining == TILE_ITEMS) - { - // Update the value at the key location - OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; - d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); - } - } - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not the last tile (full) - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); - } - else if (num_remaining > 0) - { - // The last tile (possibly partially-full) - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh deleted file mode 100644 index f365481..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_select_if.cuh +++ /dev/null @@ -1,703 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. 
All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
- */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../block/block_discontinuity.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentSelectIf - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentSelectIfPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - - -/** - * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection - * - * Performs 
functor-based selection if SelectOpT functor type != NullType - * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for selection items - typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items - typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -struct AgentSelectIf -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type - - // The flag value type - typedef typename std::iterator_traits::value_type FlagT; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - // Constants - enum - { - USE_SELECT_OP, - USE_SELECT_FLAGS, - USE_DISCONTINUITY, - - BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), - - SELECT_METHOD = (!Equals::VALUE) ? - USE_SELECT_OP : - (!Equals::VALUE) ? - USE_SELECT_FLAGS : - USE_DISCONTINUITY - }; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for items - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - FlagsInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedFlagsInputIteratorT; - - // Parameterized BlockLoad type for input data - typedef BlockLoad< - OutputT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentSelectIfPolicyT::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockLoad type for flags - typedef BlockLoad< - FlagT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentSelectIfPolicyT::LOAD_ALGORITHM> - BlockLoadFlags; - - // Parameterized BlockDiscontinuity type for items - typedef BlockDiscontinuity< - OutputT, - BLOCK_THREADS> - BlockDiscontinuityT; - - // Parameterized BlockScan type - typedef BlockScan< - OffsetT, - BLOCK_THREADS, - AgentSelectIfPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - 
OffsetT, - cub::Sum, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Item exchange type - typedef OutputT ItemExchangeT[TILE_ITEMS]; - - // Shared memory type for this thread block - union _TempStorage - { - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - }; - - // Smem needed for loading items - typename BlockLoadT::TempStorage load_items; - - // Smem needed for loading values - typename BlockLoadFlags::TempStorage load_flags; - - // Smem needed for compacting items (allows non POD items in this union) - Uninitialized raw_exchange; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedInputIteratorT d_in; ///< Input items - SelectedOutputIteratorT d_selected_out; ///< Unique output items - WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) - InequalityWrapper inequality_op; ///< T inequality operator - SelectOpT select_op; ///< Selection operator - OffsetT num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentSelectIf( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< Output data - SelectOpT select_op, ///< 
Selection operator - EqualityOpT equality_op, ///< Equality operator - OffsetT num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_flags_in(d_flags_in), - d_selected_out(d_selected_out), - select_op(select_op), - inequality_op(equality_op), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Utility methods for initializing the selections - //--------------------------------------------------------------------- - - /** - * Initialize selections (specialized for selection operator) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT /*tile_offset*/, - OffsetT num_tile_items, - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Out-of-bounds items are selection_flags - selection_flags[ITEM] = 1; - - if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) - selection_flags[ITEM] = select_op(items[ITEM]); - } - } - - - /** - * Initialize selections (specialized for valid flags) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_tile_items, - OutputT (&/*items*/)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - CTA_SYNC(); - - FlagT flags[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - { - // Out-of-bounds items are selection_flags - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); - } - else - { - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); - } - - // Convert flag type to selection_flags type - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - selection_flags[ITEM] = flags[ITEM]; - } - } - - - /** - * Initialize selections (specialized for 
discontinuity detection) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_tile_items, - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - if (IS_FIRST_TILE) - { - CTA_SYNC(); - - // Set head selection_flags. First tile sets the first flag for the first item - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); - } - else - { - OutputT tile_predecessor; - if (threadIdx.x == 0) - tile_predecessor = d_in[tile_offset - 1]; - - CTA_SYNC(); - - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); - } - - // Set selection flags for out-of-bounds items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Set selection_flags for out-of-bounds items - if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) - selection_flags[ITEM] = 1; - } - } - - - //--------------------------------------------------------------------- - // Scatter utility methods - //--------------------------------------------------------------------- - - /** - * Scatter flagged items to output offsets (specialized for direct scattering) - */ - template - __device__ __forceinline__ void ScatterDirect( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - OffsetT num_selections) - { - // Scatter flagged items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (selection_flags[ITEM]) - { - if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) - { - d_selected_out[selection_indices[ITEM]] = items[ITEM]; - } - } - } - } - - - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OutputT 
(&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition - { - CTA_SYNC(); - - // Compact and scatter items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; - if (selection_flags[ITEM]) - { - temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; - } - } - - CTA_SYNC(); - - for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) - { - d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; - } - } - - - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition - { - CTA_SYNC(); - - int tile_num_rejections = num_tile_items - num_tile_selections; - - // Scatter items to shared memory (rejections first) - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int 
item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; - int local_rejection_idx = item_idx - local_selection_idx; - int local_scatter_offset = (selection_flags[ITEM]) ? - tile_num_rejections + local_selection_idx : - local_rejection_idx; - - temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; - } - - CTA_SYNC(); - - // Gather items from shared memory and scatter to global - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - int rejection_idx = item_idx; - int selection_idx = item_idx - tile_num_rejections; - OffsetT scatter_offset = (item_idx < tile_num_rejections) ? - num_items - num_rejected_prefix - rejection_idx - 1 : - num_selections_prefix + selection_idx; - - OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; - - if (!IS_LAST_TILE || (item_idx < num_tile_items)) - { - d_selected_out[scatter_offset] = item; - } - } - } - - - /** - * Scatter flagged items - */ - template - __device__ __forceinline__ void Scatter( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - OffsetT num_selections) ///< Total number of selections including this tile - { - // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one - if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) - { - ScatterTwoPhase( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - 
num_selections_prefix, - num_rejected_prefix, - Int2Type()); - } - else - { - ScatterDirect( - items, - selection_flags, - selection_indices, - num_selections); - } - } - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - - /** - * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) - */ - template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - OutputT items[ITEMS_PER_THREAD]; - OffsetT selection_flags[ITEMS_PER_THREAD]; - OffsetT selection_indices[ITEMS_PER_THREAD]; - - // Load items - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); - else - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); - - // Initialize selection_flags - InitializeSelections( - tile_offset, - num_tile_items, - items, - selection_flags, - Int2Type()); - - CTA_SYNC(); - - // Exclusive scan of selection_flags - OffsetT num_tile_selections; - BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); - - if (threadIdx.x == 0) - { - // Update tile status if this is not the last tile - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, num_tile_selections); - } - - // Discount any out-of-bounds selections - if (IS_LAST_TILE) - num_tile_selections -= (TILE_ITEMS - num_tile_items); - - // Scatter flagged items - Scatter( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - 0, - 0, - num_tile_selections); - - return num_tile_selections; - } - - - /** - * Process subsequent tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) - */ - template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - OutputT items[ITEMS_PER_THREAD]; - OffsetT selection_flags[ITEMS_PER_THREAD]; - OffsetT selection_indices[ITEMS_PER_THREAD]; - - // Load items - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); - else - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); - - // Initialize selection_flags - InitializeSelections( - tile_offset, - num_tile_items, - items, - selection_flags, - Int2Type()); - - CTA_SYNC(); - - // Exclusive scan of values and selection_flags - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); - BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); - - OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); - OffsetT num_selections = prefix_op.GetInclusivePrefix(); - OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); - OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; - - // Discount any out-of-bounds selections - if (IS_LAST_TILE) - { - int num_discount = TILE_ITEMS - num_tile_items; - num_selections -= num_discount; - num_tile_selections -= num_discount; - } - - // Scatter flagged items - Scatter( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_rejected_prefix, - num_selections); - - return num_selections; - } - - - /** - * Process a tile of input - */ - template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile 
offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - OffsetT num_selections; - if (tile_idx == 0) - { - num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); - } - else - { - num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); - } - - return num_selections; - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - - if (tile_idx < num_tiles - 1) - { - // Not the last tile (full) - ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); - } - else - { - // The last tile (possibly partially-full) - OffsetT num_remaining = num_items - tile_offset; - OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - - if (threadIdx.x == 0) - { - // Output the total number of items selection_flags - *d_num_selected_out = num_selections; - } - } - } - -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh deleted file mode 100644 index 3168554..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/agent_spmv_orig.cuh +++ /dev/null @@ -1,925 +0,0 @@ -/****************************************************************************** - * Copyright 
(c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. 
- */ - -#pragma once - -#include - -#include "../util_type.cuh" -#include "../block/block_reduce.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../thread/thread_search.cuh" -#include "../thread/thread_operators.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/counting_input_iterator.cuh" -#include "../iterator/tex_ref_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentSpmv - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search - CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets - CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices - CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values - CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values - bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. 
pre-staged through shared memory) - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentSpmvPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) - }; - - static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices - static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values - static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use - -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets -struct SpmvParams -{ - ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
- OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. - ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand - - TexRefInputIterator t_vector_x; -}; - - -/** - * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. - */ -template < - typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 - bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability -struct AgentSpmv -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - /// 2D merge path coordinate type - typedef typename CubVector::Type CoordinateT; - - /// Input iterator wrapper types (for applying cache modifiers) - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, - 
OffsetT, - OffsetT> - RowOffsetsSearchIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, - OffsetT, - OffsetT> - RowOffsetsIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, - OffsetT, - OffsetT> - ColumnIndicesIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - ValueIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - VectorValueIteratorT; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair KeyValuePairT; - - // Reduce-value-by-segment scan operator - typedef ReduceByKeyOp ReduceBySegmentOpT; - - // BlockReduce specialization - typedef BlockReduce< - ValueT, - BLOCK_THREADS, - BLOCK_REDUCE_WARP_REDUCTIONS> - BlockReduceT; - - // BlockScan specialization - typedef BlockScan< - KeyValuePairT, - BLOCK_THREADS, - AgentSpmvPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // BlockScan specialization - typedef BlockScan< - ValueT, - BLOCK_THREADS, - AgentSpmvPolicyT::SCAN_ALGORITHM> - BlockPrefixSumT; - - // BlockExchange specialization - typedef BlockExchange< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD> - BlockExchangeT; - - /// Merge item type (either a non-zero value or a row-end offset) - union MergeItem - { - // Value type to pair with index type OffsetT (NullType if loading values directly during merge) - typedef typename If::Type MergeValueT; - - OffsetT row_end_offset; - MergeValueT nonzero; - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CoordinateT tile_coords[2]; - - union Aliasable - { - // Smem needed for tile of merge items - MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; - - // Smem needed for block exchange - typename BlockExchangeT::TempStorage exchange; - - // Smem needed for block-wide reduction - typename 
BlockReduceT::TempStorage reduce; - - // Smem needed for tile scanning - typename BlockScanT::TempStorage scan; - - // Smem needed for tile prefix sum - typename BlockPrefixSumT::TempStorage prefix_sum; - - } aliasable; - }; - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - - _TempStorage& temp_storage; /// Reference to temp_storage - - SpmvParams& spmv_params; - - ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
- VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentSpmv( - TempStorage& temp_storage, ///< Reference to temp_storage - SpmvParams& spmv_params) ///< SpMV input parameter bundle - : - temp_storage(temp_storage.Alias()), - spmv_params(spmv_params), - wd_values(spmv_params.d_values), - wd_row_end_offsets(spmv_params.d_row_end_offsets), - wd_column_indices(spmv_params.d_column_indices), - wd_vector_x(spmv_params.d_vector_x), - wd_vector_y(spmv_params.d_vector_y) - {} - - - - - /** - * Consume a merge tile, specialized for direct-load of nonzeros - */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - - // Gather the row end-offsets for the merge tile into shared memory - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - CTA_SYNC(); - - // Search for the thread's starting coordinate within the merge tile - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - 
s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - CTA_SYNC(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - - ValueT running_total = 0.0; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = wd_vector_x[column_idx]; -#endif - ValueT nonzero = value * vector_value; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - running_total += nonzero; - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = tile_num_rows; - ++thread_current_coord.y; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = thread_current_coord.x; - running_total = 0.0; - ++thread_current_coord.x; - } - } - - CTA_SYNC(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (tile_num_rows > 0) - { - if (threadIdx.x == 0) - scan_item.key = -1; - - // Direct scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM].key < tile_num_rows) - { - if (scan_item.key == scan_segment[ITEM].key) - scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; - - if 
(HAS_ALPHA) - { - scan_segment[ITEM].value *= spmv_params.alpha; - } - - if (HAS_BETA) - { - // Update the output vector element - ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; - scan_segment[ITEM].value += addend; - } - - // Set the output vector element - spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; - } - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - -#if (CUB_PTX_ARCH >= 520) - -/* - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[tile_num_nonzeros].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; - - OffsetT col_indices[ITEMS_PER_THREAD]; - ValueT mat_values[ITEMS_PER_THREAD]; - int nonzero_indices[ITEMS_PER_THREAD]; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - nonzero_indices[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_indices[ITEM]; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_indices[ITEM]; - - col_indices[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? *ci : 0; - mat_values[ITEM] = (nonzero_indices[ITEM] < tile_num_nonzeros) ? 
*a : 0.0; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - VectorValueIteratorT x = wd_vector_x + col_indices[ITEM]; - mat_values[ITEM] *= *x; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT *s = s_tile_nonzeros + nonzero_indices[ITEM]; - - *s = mat_values[ITEM]; - } - - CTA_SYNC(); - -*/ - - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; - ValueT* s = s_tile_nonzeros + nonzero_idx; - - if (nonzero_idx < tile_num_nonzeros) - { - - OffsetT column_idx = *ci; - ValueT value = *a; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; - vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - *s = nonzero; - } - } - - -#else - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - if (tile_num_nonzeros > 0) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = 
wd_vector_x[column_idx]; -#endif - ValueT nonzero = value * vector_value; - - s_tile_nonzeros[nonzero_idx] = nonzero; - } - } - -#endif - - // Gather the row end-offsets for the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - CTA_SYNC(); - - // Search for the thread's starting coordinate within the merge tile - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - CTA_SYNC(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - ValueT running_total = 0.0; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - scan_segment[ITEM].value = nonzero; - running_total += nonzero; - ++thread_current_coord.y; - nonzero = s_tile_nonzeros[thread_current_coord.y]; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = 0.0; - running_total = 0.0; - ++thread_current_coord.x; - row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - } - - scan_segment[ITEM].key = thread_current_coord.x; - } - - CTA_SYNC(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - 
BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (threadIdx.x == 0) - { - scan_item.key = thread_start_coord.x; - scan_item.value = 0.0; - } - - if (tile_num_rows > 0) - { - - CTA_SYNC(); - - // Scan downsweep and scatter - ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; - - if (scan_item.key != scan_segment[0].key) - { - s_partials[scan_item.key] = scan_item.value; - } - else - { - scan_segment[0].value += scan_item.value; - } - - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) - { - s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; - } - else - { - scan_segment[ITEM].value += scan_segment[ITEM - 1].value; - } - } - - CTA_SYNC(); - - #pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - - - - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - * / - template - __device__ __forceinline__ KeyValuePairT ConsumeTile1( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - IsDirectLoadT is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - - OffsetT* s_tile_row_end_offsets = &temp_storage.merge_items[0].row_end_offset; - - int warp_idx = threadIdx.x / WARP_THREADS; - int lane_idx = LaneId(); - - // Gather the row end-offsets for the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - 
CTA_SYNC(); - - // Search for warp start/end coords - if (lane_idx == 0) - { - MergePathSearch( - OffsetT(warp_idx * ITEMS_PER_WARP), // Diagonal - s_tile_row_end_offsets, // List A - CountingInputIterator(tile_start_coord.y), // List B - tile_num_rows, - tile_num_nonzeros, - temp_storage.warp_coords[warp_idx]); - - CoordinateT last = {tile_num_rows, tile_num_nonzeros}; - temp_storage.warp_coords[WARPS] = last; - } - - CTA_SYNC(); - - CoordinateT warp_coord = temp_storage.warp_coords[warp_idx]; - CoordinateT warp_end_coord = temp_storage.warp_coords[warp_idx + 1]; - OffsetT warp_nonzero_idx = tile_start_coord.y + warp_coord.y; - - // Consume whole rows - #pragma unroll 1 - for (; warp_coord.x < warp_end_coord.x; ++warp_coord.x) - { - ValueT row_total = 0.0; - OffsetT row_end_offset = s_tile_row_end_offsets[warp_coord.x]; - - #pragma unroll 1 - for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; - nonzero_idx < row_end_offset; - nonzero_idx += WARP_THREADS) - { - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - ValueT vector_value = wd_vector_x[column_idx]; - row_total += value * vector_value; - } - - // Warp reduce - row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); - - // Output - if (lane_idx == 0) - { - spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; - } - - warp_nonzero_idx = row_end_offset; - } - - // Consume partial portion of thread's last row - if (warp_nonzero_idx < tile_start_coord.y + warp_end_coord.y) - { - ValueT row_total = 0.0; - for (OffsetT nonzero_idx = warp_nonzero_idx + lane_idx; - nonzero_idx < tile_start_coord.y + warp_end_coord.y; - nonzero_idx += WARP_THREADS) - { - - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - ValueT vector_value = wd_vector_x[column_idx]; - row_total += value * vector_value; - } - - // Warp reduce - row_total = WarpReduceT(temp_storage.warp_reduce[warp_idx]).Sum(row_total); 
- - // Output - if (lane_idx == 0) - { - spmv_params.d_vector_y[tile_start_coord.x + warp_coord.x] = row_total; - } - } - - // Return the tile's running carry-out - KeyValuePairT tile_carry(tile_num_rows, 0.0); - return tile_carry; - } -*/ - - - - - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - * / - __device__ __forceinline__ KeyValuePairT ConsumeTile2( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - - ValueT* s_tile_nonzeros = &temp_storage.merge_items[0].nonzero; - - ValueT nonzeros[ITEMS_PER_THREAD]; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = wd_vector_x[column_idx]; -#endif - - nonzeros[ITEM] = value * vector_value; - } - - // Exchange striped->blocked - BlockExchangeT(temp_storage.exchange).StripedToBlocked(nonzeros); - - CTA_SYNC(); - - // Compute an inclusive prefix sum - BlockPrefixSumT(temp_storage.prefix_sum).InclusiveSum(nonzeros, nonzeros); - - CTA_SYNC(); - - if (threadIdx.x == 0) - s_tile_nonzeros[0] = 0.0; - - // Scatter back to smem - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM + 1; - s_tile_nonzeros[item_idx] = nonzeros[ITEM]; - } - - CTA_SYNC(); - - // Gather the row end-offsets for 
the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - OffsetT start = CUB_MAX(wd_row_end_offsets[tile_start_coord.x + item - 1], tile_start_coord.y); - OffsetT end = wd_row_end_offsets[tile_start_coord.x + item]; - - start -= tile_start_coord.y; - end -= tile_start_coord.y; - - ValueT row_partial = s_tile_nonzeros[end] - s_tile_nonzeros[start]; - - spmv_params.d_vector_y[tile_start_coord.x + item] = row_partial; - } - - // Get the tile's carry-out - KeyValuePairT tile_carry; - if (threadIdx.x == 0) - { - tile_carry.key = tile_num_rows; - - OffsetT start = CUB_MAX(wd_row_end_offsets[tile_end_coord.x - 1], tile_start_coord.y); - start -= tile_start_coord.y; - OffsetT end = tile_num_nonzeros; - - tile_carry.value = s_tile_nonzeros[end] - s_tile_nonzeros[start]; - } - - // Return the tile's running carry-out - return tile_carry; - } -*/ - - - /** - * Consume input tile - */ - __device__ __forceinline__ void ConsumeTile( - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_merge_tiles) ///< [in] Number of merge tiles - { - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - - if (tile_idx >= num_merge_tiles) - return; - - // Read our starting coordinates - if (threadIdx.x < 2) - { - if (d_tile_coordinates == NULL) - { - // Search our starting coordinates - OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; - CoordinateT tile_coord; - CountingInputIterator nonzero_indices(0); - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coord); - - temp_storage.tile_coords[threadIdx.x] = tile_coord; - } - else - { - 
temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; - } - } - - CTA_SYNC(); - - CoordinateT tile_start_coord = temp_storage.tile_coords[0]; - CoordinateT tile_end_coord = temp_storage.tile_coords[1]; - - // Consume multi-segment tile - KeyValuePairT tile_carry = ConsumeTile( - tile_idx, - tile_start_coord, - tile_end_coord, - Int2Type()); - - // Output the tile's carry-out - if (threadIdx.x == 0) - { - if (HAS_ALPHA) - tile_carry.value *= spmv_params.alpha; - - tile_carry.key += tile_start_coord.x; - d_tile_carry_pairs[tile_idx] = tile_carry; - } - } - - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh deleted file mode 100644 index 2f67137..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/agent/single_pass_scan_operators.cuh +++ /dev/null @@ -1,815 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Callback operator types for supplying BlockScan prefixes - */ - -#pragma once - -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../warp/warp_reduce.cuh" -#include "../util_arch.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Prefix functor type for maintaining a running prefix while scanning a - * region independent of other thread blocks - ******************************************************************************/ - -/** - * Stateful callback operator type for supplying BlockScan prefixes. - * Maintains a running prefix that can be applied to consecutive - * BlockScan operations. 
- */ -template < - typename T, ///< BlockScan value type - typename ScanOpT> ///< Wrapped scan operator type -struct BlockScanRunningPrefixOp -{ - ScanOpT op; ///< Wrapped scan operator - T running_total; ///< Running block-wide prefix - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) - : - op(op) - {} - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp( - T starting_prefix, - ScanOpT op) - : - op(op), - running_total(starting_prefix) - {} - - /** - * Prefix callback operator. Returns the block-wide running_total in thread-0. - */ - __device__ __forceinline__ T operator()( - const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs - { - T retval = running_total; - running_total = op(running_total, block_aggregate); - return retval; - } -}; - - -/****************************************************************************** - * Generic tile status interface types for block-cooperative scans - ******************************************************************************/ - -/** - * Enumerations of tile status - */ -enum ScanTileStatus -{ - SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) - SCAN_TILE_INVALID = 99, // Not yet processed - SCAN_TILE_PARTIAL, // Tile aggregate is available - SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available -}; - - -/** - * Tile status interface. - */ -template < - typename T, - bool SINGLE_WORD = Traits::PRIMITIVE> -struct ScanTileState; - - -/** - * Tile status interface specialized for scan status and value types - * that can be combined into one machine word that can be - * read/written coherently in a single access. 
- */ -template -struct ScanTileState -{ - // Status word type - typedef typename If<(sizeof(T) == 8), - long long, - typename If<(sizeof(T) == 4), - int, - typename If<(sizeof(T) == 2), - short, - char>::Type>::Type>::Type StatusWord; - - - // Unit word type - typedef typename If<(sizeof(T) == 8), - longlong2, - typename If<(sizeof(T) == 4), - int2, - typename If<(sizeof(T) == 2), - int, - uchar2>::Type>::Type>::Type TxnWord; - - - // Device word type - struct TileDescriptor - { - StatusWord status; - T value; - }; - - - // Constants - enum - { - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - - // Device storage - TxnWord *d_tile_descriptors; - - /// Constructor - __host__ __device__ __forceinline__ - ScanTileState() - : - d_tile_descriptors(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation - { - d_tile_descriptors = reinterpret_cast(d_temp_storage); - return cudaSuccess; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors - return cudaSuccess; - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - - TxnWord val = TxnWord(); - TileDescriptor *descriptor = reinterpret_cast(&val); - - if (tile_idx < num_tiles) - { - // Not-yet-set - descriptor->status = StatusWord(SCAN_TILE_INVALID); - d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - descriptor->status = StatusWord(SCAN_TILE_OOB); - d_tile_descriptors[threadIdx.x] = val; - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_INCLUSIVE; - tile_descriptor.value = tile_inclusive; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PARTIAL; - tile_descriptor.value = tile_partial; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; 
- ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - T &value) - { - TileDescriptor tile_descriptor; - do - { - __threadfence_block(); // prevent hoisting loads from loop - TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); - tile_descriptor = reinterpret_cast(alias); - - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); - - status = tile_descriptor.status; - value = tile_descriptor.value; - } - -}; - - - -/** - * Tile status interface specialized for scan status and value types that - * cannot be combined into one machine word. - */ -template -struct ScanTileState -{ - // Status word type - typedef char StatusWord; - - // Constants - enum - { - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - // Device storage - StatusWord *d_tile_status; - T *d_tile_partial; - T *d_tile_inclusive; - - /// Constructor - __host__ __device__ __forceinline__ - ScanTileState() - : - d_tile_status(NULL), - d_tile_partial(NULL), - d_tile_inclusive(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int num_tiles, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation - { - cudaError_t error = cudaSuccess; - do - { - void* allocations[3]; - size_t allocation_sizes[3]; - - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives - - // Compute allocation pointers into the single storage blob - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Alias the offsets - d_tile_status = reinterpret_cast(allocations[0]); - d_tile_partial = reinterpret_cast(allocations[1]); - d_tile_inclusive = reinterpret_cast(allocations[2]); - } - while (0); - - return error; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives - - // Set the necessary size of the blob - void* allocations[3]; - return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < 
num_tiles) - { - // Not-yet-set - d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) - { - // Update tile inclusive value - ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); - - // Fence - __threadfence(); - - // Update tile status - ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) - { - // Update tile partial value - ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); - - // Fence - __threadfence(); - - // Update tile status - ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - T &value) - { - do { - status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); - - __threadfence(); // prevent hoisting loads from loop or loads below above this one - - } while (status == SCAN_TILE_INVALID); - - if (status == StatusWord(SCAN_TILE_PARTIAL)) - value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); - else - value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); - } -}; - - -/****************************************************************************** - * ReduceByKey tile status interface types for block-cooperative scans - ******************************************************************************/ - -/** - * Tile status 
interface for reduction by key. - * - */ -template < - typename ValueT, - typename KeyT, - bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> -struct ReduceByKeyScanTileState; - - -/** - * Tile status interface for reduction by key, specialized for scan status and value types that - * cannot be combined into one machine word. - */ -template < - typename ValueT, - typename KeyT> -struct ReduceByKeyScanTileState : - ScanTileState > -{ - typedef ScanTileState > SuperClass; - - /// Constructor - __host__ __device__ __forceinline__ - ReduceByKeyScanTileState() : SuperClass() {} -}; - - -/** - * Tile status interface for reduction by key, specialized for scan status and value types that - * can be combined into one machine word that can be read/written coherently in a single access. - */ -template < - typename ValueT, - typename KeyT> -struct ReduceByKeyScanTileState -{ - typedef KeyValuePairKeyValuePairT; - - // Constants - enum - { - PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), - TXN_WORD_SIZE = 1 << Log2::VALUE, - STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, - - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - // Status word type - typedef typename If<(STATUS_WORD_SIZE == 8), - long long, - typename If<(STATUS_WORD_SIZE == 4), - int, - typename If<(STATUS_WORD_SIZE == 2), - short, - char>::Type>::Type>::Type StatusWord; - - // Status word type - typedef typename If<(TXN_WORD_SIZE == 16), - longlong2, - typename If<(TXN_WORD_SIZE == 8), - long long, - int>::Type>::Type TxnWord; - - // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) - struct TileDescriptorBigStatus - { - KeyT key; - ValueT value; - StatusWord status; - }; - - // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) - struct TileDescriptorLittleStatus - { - ValueT value; - StatusWord status; - KeyT key; - }; - - // Device word type - typedef typename If< - (sizeof(ValueT) == sizeof(KeyT)), - TileDescriptorBigStatus, - TileDescriptorLittleStatus>::Type - 
TileDescriptor; - - - // Device storage - TxnWord *d_tile_descriptors; - - - /// Constructor - __host__ __device__ __forceinline__ - ReduceByKeyScanTileState() - : - d_tile_descriptors(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation - { - d_tile_descriptors = reinterpret_cast(d_temp_storage); - return cudaSuccess; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors - return cudaSuccess; - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor *descriptor = reinterpret_cast(&val); - - if (tile_idx < num_tiles) - { - // Not-yet-set - descriptor->status = StatusWord(SCAN_TILE_INVALID); - d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - descriptor->status = StatusWord(SCAN_TILE_OOB); - d_tile_descriptors[threadIdx.x] = val; - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_INCLUSIVE; - 
tile_descriptor.value = tile_inclusive.value; - tile_descriptor.key = tile_inclusive.key; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PARTIAL; - tile_descriptor.value = tile_partial.value; - tile_descriptor.key = tile_partial.key; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - KeyValuePairT &value) - { -// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); -// TileDescriptor tile_descriptor = reinterpret_cast(alias); -// -// while (tile_descriptor.status == SCAN_TILE_INVALID) -// { -// __threadfence_block(); // prevent hoisting loads from loop -// -// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); -// tile_descriptor = reinterpret_cast(alias); -// } -// -// status = tile_descriptor.status; -// value.value = tile_descriptor.value; -// value.key = tile_descriptor.key; - - TileDescriptor tile_descriptor; - do - { - __threadfence_block(); // prevent hoisting loads from loop - TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); - tile_descriptor = reinterpret_cast(alias); - - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); - - status = tile_descriptor.status; - value.value = tile_descriptor.value; - value.key = tile_descriptor.key; - } - -}; - - -/****************************************************************************** - * Prefix call-back operator for 
coupling local block scan within a - * block-cooperative scan - ******************************************************************************/ - -/** - * Stateful block-scan prefix functor. Provides the the running prefix for - * the current tile by using the call-back warp to wait on on - * aggregates/prefixes from predecessor tiles to become available. - */ -template < - typename T, - typename ScanOpT, - typename ScanTileStateT, - int PTX_ARCH = CUB_PTX_ARCH> -struct TilePrefixCallbackOp -{ - // Parameterized warp reduce - typedef WarpReduce WarpReduceT; - - // Temporary storage type - struct _TempStorage - { - typename WarpReduceT::TempStorage warp_reduce; - T exclusive_prefix; - T inclusive_prefix; - T block_aggregate; - }; - - // Alias wrapper allowing temporary storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - // Type of status word - typedef typename ScanTileStateT::StatusWord StatusWord; - - // Fields - _TempStorage& temp_storage; ///< Reference to a warp-reduction instance - ScanTileStateT& tile_status; ///< Interface to tile status - ScanOpT scan_op; ///< Binary scan operator - int tile_idx; ///< The current tile index - T exclusive_prefix; ///< Exclusive prefix for the tile - T inclusive_prefix; ///< Inclusive prefix for the tile - - // Constructor - __device__ __forceinline__ - TilePrefixCallbackOp( - ScanTileStateT &tile_status, - TempStorage &temp_storage, - ScanOpT scan_op, - int tile_idx) - : - temp_storage(temp_storage.Alias()), - tile_status(tile_status), - scan_op(scan_op), - tile_idx(tile_idx) {} - - - // Block until all predecessors within the warp-wide window have non-invalid status - __device__ __forceinline__ - void ProcessWindow( - int predecessor_idx, ///< Preceding tile index to inspect - StatusWord &predecessor_status, ///< [out] Preceding tile status - T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles - { - T value; - tile_status.WaitForValid(predecessor_idx, 
predecessor_status, value); - - // Perform a segmented reduction to get the prefix for the current window. - // Use the swizzled scan operator because we are now scanning *down* towards thread0. - - int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); - window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( - value, - tail_flag, - SwizzleScanOp(scan_op)); - } - - - // BlockScan prefix callback functor (called by the first warp) - __device__ __forceinline__ - T operator()(T block_aggregate) - { - - // Update our status with our tile-aggregate - if (threadIdx.x == 0) - { - temp_storage.block_aggregate = block_aggregate; - tile_status.SetPartial(tile_idx, block_aggregate); - } - - int predecessor_idx = tile_idx - threadIdx.x - 1; - StatusWord predecessor_status; - T window_aggregate; - - // Wait for the warp-wide window of predecessor tiles to become valid - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - - // The exclusive tile prefix starts out as the current window aggregate - exclusive_prefix = window_aggregate; - - // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) - { - predecessor_idx -= CUB_PTX_WARP_THREADS; - - // Update exclusive tile prefix with the window prefix - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); - } - - // Compute the inclusive tile prefix and update the status for this tile - if (threadIdx.x == 0) - { - inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); - tile_status.SetInclusive(tile_idx, inclusive_prefix); - - temp_storage.exclusive_prefix = exclusive_prefix; - temp_storage.inclusive_prefix = inclusive_prefix; - } - - // Return exclusive_prefix - return exclusive_prefix; - } - - // Get the exclusive prefix stored in temporary storage - __device__ 
__forceinline__ - T GetExclusivePrefix() - { - return temp_storage.exclusive_prefix; - } - - // Get the inclusive prefix stored in temporary storage - __device__ __forceinline__ - T GetInclusivePrefix() - { - return temp_storage.inclusive_prefix; - } - - // Get the block aggregate stored in temporary storage - __device__ __forceinline__ - T GetBlockAggregate() - { - return temp_storage.block_aggregate; - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh deleted file mode 100644 index 1125fe5..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_adjacent_difference.cuh +++ /dev/null @@ -1,596 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockAdjacentDifference -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T first_items[BLOCK_THREADS]; - T last_items[BLOCK_THREADS]; - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /// Specialization for when FlagOp has third index param - template ::HAS_PARAM> - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(b, a, idx); - } - }; - - /// Specialization for when FlagOp does not have a third index param - template - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) - { - return flag_op(b, a); - } - }; - - /// Templated unrolling of item comparison (inductive case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void 
FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - preds[ITERATION] = input[ITERATION - 1]; - - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - preds[ITERATION], - input[ITERATION], - (linear_tid * ITEMS_PER_THREAD) + ITERATION); - - Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); - } - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - input[ITERATION], - input[ITERATION + 1], - (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); - - Iterate::FlagTails(linear_tid, flags, input, flag_op); - } - - }; - - /// Templated unrolling of item comparison (termination case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's 
discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockDiscontinuity} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockAdjacentDifference() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockAdjacentDifference( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Head flag operations - *********************************************************************/ - //@{ - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - if (linear_tid == 0) - { - // Set flag for first thread-item (preds[0] is undefined) - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - } - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first 
tile item (input0 from thread0). - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
- { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); - } - - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
- tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = temp_storage.last_items[linear_tid - 1]; - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
- 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
- tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
- 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
- tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh deleted file mode 100644 index 428882f..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_discontinuity.cuh +++ /dev/null @@ -1,1148 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) - * \ingroup BlockModule - * - * \tparam T The data type to be flagged. 
- * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items - * that differ from their predecessors (or successors). For example, head flags are convenient - * for demarcating disjoint data segments as part of a segmented scan or reduction. - * - \blocked - * - * \par Performance Considerations - * - \granularity - * - * \par A Simple Example - * \blockcollective{BlockDiscontinuity} - * \par - * The code snippet below illustrates the head flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
- * - * \par Performance Considerations - * - Incurs zero bank conflicts for most types - * - */ -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockDiscontinuity -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T first_items[BLOCK_THREADS]; - T last_items[BLOCK_THREADS]; - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /// Specialization for when FlagOp has third index param - template ::HAS_PARAM> - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(a, b, idx); - } - }; - - /// Specialization for when FlagOp does not have a third index param - template - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) - { - return flag_op(a, b); - } - }; - - /// Templated unrolling of item comparison (inductive case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's 
discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - preds[ITERATION] = input[ITERATION - 1]; - - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - preds[ITERATION], - input[ITERATION], - (linear_tid * ITEMS_PER_THREAD) + ITERATION); - - Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); - } - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - input[ITERATION], - input[ITERATION + 1], - (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); - - Iterate::FlagTails(linear_tid, flags, input, flag_op); - } - - }; - - /// Templated unrolling of item comparison (termination case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - 
FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockDiscontinuity} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockDiscontinuity() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Head flag operations - *********************************************************************/ - //@{ - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - if (linear_tid == 0) - { - // Set flag for first thread-item (preds[0] is undefined) - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - } - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile 
item (input0 from thread0). - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op); - } - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is compared - * against \p tile_predecessor_item. 
- * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads( - * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, - * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be - * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
- */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); - } - - - - //@} end member group - /******************************************************************//** - * \name Tail flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. - * - * \par - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. - * The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
- 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... 
- * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * tail_flags, thread_data, cub::Inequality(), tile_successor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
- tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - //@} end member group - /******************************************************************//** - * \name Head & tail flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is always flagged. - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head and flags for discontinuities in the segment - * int head_flags[4]; - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * head_flags, tail_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags - * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
- */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = temp_storage.last_items[linear_tid - 1]; - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is always flagged. 
- * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_predecessor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... - * - * // Collectively compute head and flags for discontinuities in the segment - * int head_flags[4]; - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags - * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. 
- * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? 
- tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is compared - * against \p tile_predecessor_item. - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... - * - * // Collectively compute head and flags for discontinuities in the segment - * int head_flags[4]; - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, - * thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags - * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. 
\p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. 
- * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is compared - * against \p tile_predecessor_item. - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... 
- * - * // Collectively compute head and flags for discontinuities in the segment - * int head_flags[4]; - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, - * thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags - * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh deleted file mode 100644 index c0e32fd..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_exchange.cuh +++ /dev/null @@ -1,1248 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "../util_ptx.cuh" -#include "../util_arch.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) - * \ingroup BlockModule - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. - * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - It is commonplace for blocks of threads to rearrange data items between - * threads. For example, the device-accessible memory subsystem prefers access patterns - * where data items are "striped" across threads (where consecutive threads access consecutive items), - * yet most block-wide operations prefer a "blocked" partitioning of items across threads - * (where consecutive items belong to a single thread). 
- * - BlockExchange supports the following types of data exchanges: - * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements - * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements - * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) - * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) - * - \rowmajor - * - * \par A Simple Example - * \blockcollective{BlockExchange} - * \par - * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of data striped across threads - * int thread_data[4]; - * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).StripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - * \par Performance Considerations - * - Proper device-specific padding ensures zero bank conflicts for most types. 
- * - */ -template < - typename InputT, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - bool WARP_TIME_SLICING = false, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockExchange -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? 
(TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - struct __align__(16) _TempStorage - { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; - }; - -public: - - /// \smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - if (warp_id == 0) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - // No timeslicing - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - } - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. 
- */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - CTA_SYNC(); - - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockExchange() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), - lane_id(LaneId()), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - lane_id(LaneId()), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - //@} end member group - /******************************************************************//** - * \name Structured exchanges - *********************************************************************/ - //@{ - - /** - * \brief Transposes data items from striped arrangement to blocked arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a striped arrangement across block threads - * int thread_data[4]; - * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - StripedToBlocked(input_items, output_items, Int2Type()); - } - - - /** - * \brief Transposes data items from blocked arrangement to striped arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively exchange data into a striped arrangement across threads - * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); - * - * // Store data striped across block threads into an ordered tile - * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in - * preparation for storing to device-accessible memory. - * - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - BlockedToStriped(input_items, output_items, Int2Type()); - } - - - - /** - * \brief Transposes data items from warp-striped arrangement to blocked arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a warp-striped arrangement across warp threads - * int thread_data[4]; - * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of warp-striped input \p thread_data across the block of threads is - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * after loading from device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - WarpStripedToBlocked(input_items, output_items, Int2Type()); - } - - - - /** - * \brief Transposes data items from blocked arrangement to warp-striped arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively exchange data into a warp-striped arrangement across threads - * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); - * - * // Store data striped across warp threads into an ordered tile - * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * in preparation for storing to device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - BlockedToWarpStriped(input_items, output_items, Int2Type()); - } - - - - //@} end member group - /******************************************************************//** - * \name Scatter exchanges - *********************************************************************/ - //@{ - - - /** - * \brief Exchanges data items annotated by rank into blocked arrangement. 
- * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToBlocked(input_items, output_items, ranks, Int2Type()); - } - - - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStriped(input_items, output_items, ranks, Int2Type()); - } - - - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - if (ranks[ITEM] >= 0) - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - * \tparam ValidFlag [inferred] FlagT type denoting which items are valid - */ - template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - if (is_valid[ITEM]) - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - //@} end member group - - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - __device__ __forceinline__ void StripedToBlocked( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - StripedToBlocked(items, items); - } - - __device__ __forceinline__ void BlockedToStriped( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - BlockedToStriped(items, items); - } - - __device__ __forceinline__ void WarpStripedToBlocked( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - WarpStripedToBlocked(items, items); - } - - __device__ __forceinline__ void BlockedToWarpStriped( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - BlockedToWarpStriped(items, items); - } - - template - __device__ __forceinline__ void ScatterToBlocked( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToBlocked(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStriped( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStriped(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStripedGuarded(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity - { - ScatterToStriped(items, items, ranks, is_valid); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -template < - typename T, - int ITEMS_PER_THREAD, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpExchange -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - /// Constants - enum - { - // Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - // Insert padding if the number of items per thread is a 
power of two and > 4 (otherwise we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - struct _TempStorage - { - T buff[WARP_ITEMS + PADDING_ITEMS]; - }; - -public: - - /// \smemstorage{WarpExchange} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - int lane_id; - -public: - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpExchange( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. 
- * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); - temp_storage.buff[ranks[ITEM]] = items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - items[ITEM] = temp_storage.buff[item_offset]; - } - } - -}; - - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh deleted file mode 100644 index 5d393c2..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_histogram.cuh +++ /dev/null @@ -1,415 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "specializations/block_histogram_sort.cuh" -#include "specializations/block_histogram_atomic.cuh" -#include "../util_ptx.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. - */ -enum BlockHistogramAlgorithm -{ - - /** - * \par Overview - * Sorting followed by differentiation. Execution is comprised of two phases: - * -# Sort the data using efficient radix sort - * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. - * - * \par Performance Considerations - * Delivers consistent throughput regardless of sample bin distribution. - */ - BLOCK_HISTO_SORT, - - - /** - * \par Overview - * Use atomic addition to update byte counts directly - * - * \par Performance Considerations - * Performance is strongly tied to the hardware implementation of atomic - * addition, and may be significantly degraded for non uniformly-random - * input distributions where many concurrent updates are likely to be - * made to the same bin counter. - */ - BLOCK_HISTO_ATOMIC, -}; - - - -/****************************************************************************** - * Block histogram - ******************************************************************************/ - - -/** - * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
![](histogram_logo.png) - * \ingroup BlockModule - * - * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam BINS The number bins within the histogram - * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - BlockHistogram can be optionally specialized to use different algorithms: - * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) - * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) - * - * \par Performance Considerations - * - \granularity - * - * \par A Simple Example - * \blockcollective{BlockHistogram} - * \par - * The code snippet below illustrates a 256-bin histogram of 512 integer samples that - * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char data[4]; - * ... - * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(data, smem_histogram); - * - * \endcode - * - * \par Performance and Usage Considerations - * - The histogram output can be constructed in shared or device-accessible memory - * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives - * - */ -template < - typename T, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - int BINS, - BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockHistogram -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /** - * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used - * regardless. - */ - static const BlockHistogramAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? - BLOCK_HISTO_SORT : - ALGORITHM; - - /// Internal specialization. 
- typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), - BlockHistogramSort, - BlockHistogramAtomic >::Type InternalBlockHistogram; - - /// Shared memory storage layout type for BlockHistogram - typedef typename InternalBlockHistogram::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{BlockHistogram} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockHistogram() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockHistogram( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Histogram operations - *********************************************************************/ - //@{ - - - /** - * \brief Initialize the shared histogram counters to zero. - * - * \par Snippet - * The code snippet below illustrates a the initialization and update of a - * histogram of 512 integer samples that are partitioned across 128 threads - * where each thread owns 4 samples. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... 
- * - * // Initialize the block-wide histogram - * BlockHistogram(temp_storage).InitHistogram(smem_histogram); - * - * // Update the block-wide histogram - * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam CounterT [inferred] Histogram counter type - */ - template - __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) - { - // Initialize histogram bin counts to zeros - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - histogram[histo_offset + linear_tid] = 0; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - histogram[histo_offset + linear_tid] = 0; - } - } - - - /** - * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a 256-bin histogram of 512 integer samples that - * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... 
- * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam CounterT [inferred] Histogram counter type - */ - template < - typename CounterT > - __device__ __forceinline__ void Histogram( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - // Initialize histogram bin counts to zeros - InitHistogram(histogram); - - CTA_SYNC(); - - // Composite the histogram - InternalBlockHistogram(temp_storage).Composite(items, histogram); - } - - - - /** - * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a the initialization and update of a - * histogram of 512 integer samples that are partitioned across 128 threads - * where each thread owns 4 samples. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... 
- * - * // Initialize the block-wide histogram - * BlockHistogram(temp_storage).InitHistogram(smem_histogram); - * - * // Update the block-wide histogram - * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam CounterT [inferred] Histogram counter type - */ - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - InternalBlockHistogram(temp_storage).Composite(items, histogram); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh deleted file mode 100644 index 234dad2..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_load.cuh +++ /dev/null @@ -1,1268 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Operations for reading linear tiles of data into the CUDA thread block. - */ - -#pragma once - -#include - -#include "block_exchange.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -/******************************************************************//** - * \name Blocked arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Load directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[ITEM]; - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) - { - items[ITEM] = thread_itr[ITEM]; - } - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items -{ - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - items[ITEM] = oob_default; - - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); -} - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Internal implementation for load vectorization - */ -template < - CacheLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - // Biggest memory access word that T is a whole multiple of - typedef typename UnitWord::DeviceWord DeviceWord; - - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? - 4 : - (TOTAL_WORDS % 2 == 0) ? 
- 2 : - 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; - - // Vector type - typedef typename CubVector::Type Vector; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - - // Load directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) - { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); - } -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. - * - * \blocked - * - * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned - * - * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void LoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); -} - - -//@} end member group -/******************************************************************//** - * \name Striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block. - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InputIteratorT thread_itr = block_itr + linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - InputIteratorT thread_itr = block_itr + linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) - { - items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; - } - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - int BLOCK_THREADS, - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items -{ - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - items[ITEM] = oob_default; - - LoadDirectStriped(linear_tid, block_itr, items, valid_items); -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - InputIteratorT thread_itr = block_itr + warp_offset + tid ; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - InputIteratorT thread_itr = block_itr + warp_offset + tid ; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) - { - items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; - } - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
- */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items -{ - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - items[ITEM] = oob_default; - - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); -} - - - -//@} end member group - -/** @} */ // end group UtilIo - - - -//----------------------------------------------------------------------------- -// Generic BlockLoad abstraction -//----------------------------------------------------------------------------- - -/** - * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. - */ - -/** - * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. - */ -enum BlockLoadAlgorithm -{ - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is read - * directly from memory. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) decreases as the - * access stride between threads increases (i.e., the number items per thread). 
- */ - BLOCK_LOAD_DIRECT, - - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is read - * from memory using CUDA's built-in vectorized loads as a coalescing optimization. - * For example, ld.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high until the the - * access stride between threads (i.e., the number items per thread) exceeds the - * maximum vector load width (typically 4 items or 64B, whichever is lower). - * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p InputIteratorTis not a simple pointer type - * - The block input offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - */ - BLOCK_LOAD_VECTORIZE, - - /** - * \par Overview - * - * A [striped arrangement](index.html#sec5sec3) of data is read - * efficiently from memory and then locally transposed into a - * [blocked arrangement](index.html#sec5sec3). - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. - */ - BLOCK_LOAD_TRANSPOSE, - - - /** - * \par Overview - * - * A [warp-striped arrangement](index.html#sec5sec3) of data is - * read efficiently from memory and then locally transposed into a - * [blocked arrangement](index.html#sec5sec3). 
- * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - The local reordering incurs slightly larger latencies than the - * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. - * - Provisions more shared storage, but incurs smaller latencies than the - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. - */ - BLOCK_LOAD_WARP_TRANSPOSE, - - - /** - * \par Overview - * - * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) - * of data is read directly from memory and then is locally transposed into a - * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory - * requirement, only one warp's worth of shared memory is provisioned and is - * subsequently time-sliced among warps. - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - Provisions less shared memory temporary storage, but incurs larger - * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. - */ - BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, -}; - - -/** - * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. 
- * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The BlockLoad class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - BlockLoad can be optionally specialized by different data movement strategies: - * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) - * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) - * of data is read directly from memory using CUDA's built-in vectorized loads as a - * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3) - * of data is read directly from memory and is then locally transposed into a - * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3) - * of data is read directly from memory and is then locally transposed into a - * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,. 
A [warp-striped arrangement](index.html#sec5sec3) - * of data is read directly from memory and is then locally transposed into a - * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) - * - \rowmajor - * - * \par A Simple Example - * \blockcollective{BlockLoad} - * \par - * The code snippet below illustrates the loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
- * - */ -template < - typename InputT, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockLoad -{ -private: - - /****************************************************************************** - * Constants and typed definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - /// Load helper - template - struct LoadInternal; - - - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back 
assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template - __device__ __forceinline__ void Load( - InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template - __device__ __forceinline__ void Load( - const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT> - __device__ __forceinline__ void Load( - CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from - InputT 
(&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); - } - - /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) - template - __device__ __forceinline__ void Load( - _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, guarded by range (skips vectorization) - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; 
- - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectStriped(linear_tid, 
block_itr, items, temp_storage.valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - }; - - - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF 
spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - }; - - - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ - template - struct LoadInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - 
temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - }; - - - 
/****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef LoadInternal InternalLoad; - - - /// Shared memory storage layout type - typedef typename InternalLoad::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - /// \smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockLoad() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Load a linear segment of items from memory. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
- * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads - * being unmasked to load portions of valid data (and other items remaining unassigned). 
- * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., - * \p valid_items is \p 5, and the out-of-bounds default is \p -1. 
- * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads - * being unmasked to load portions of valid data (and other items are assigned \p -1) - * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); - } - - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh deleted file mode 100644 index 77500ba..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_rank.cuh +++ /dev/null @@ -1,697 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block - */ - -#pragma once - -#include - -#include "../thread/thread_reduce.cuh" -#include "../thread/thread_scan.cuh" -#include "../block/block_scan.cuh" -#include "../util_ptx.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. 
- * \ingroup BlockModule - * - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam RADIX_BITS The number of radix bits per digit place - * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * Blah... - * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). - * - \blocked - * - * \par Performance Considerations - * - \granularity - * - * \par Examples - * \par - * - Example 1: Simple radix rank of 32-bit integer keys - * \code - * #include - * - * template - * __global__ void ExampleKernel(...) - * { - * - * \endcode - */ -template < - int BLOCK_DIM_X, - int RADIX_BITS, - bool IS_DESCENDING, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockRadixRank -{ -private: - - /****************************************************************************** - * Type definitions and constants - ******************************************************************************/ - - // Integer type for digit counters (to be packed into words of type PackedCounters) - typedef unsigned short DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), - unsigned long long, - unsigned int>::Type PackedCounter; - - enum - { - // The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // The number of packed counters per thread (plus one for padding) - PADDED_COUNTER_LANES = COUNTER_LANES + 1, - RAKING_SEGMENT = PADDED_COUNTER_LANES, - }; - -public: - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), - }; - -private: - - - /// BlockScan type - typedef BlockScan< - PackedCounter, - BLOCK_DIM_X, - INNER_SCAN_ALGORITHM, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockScan; - - - /// Shared memory storage layout type for BlockRadixRank - struct __align__(16) 
_TempStorage - { - union Aliasable - { - DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; - - } aliasable; - - // Storage for scanning local ranks - typename BlockScan::TempStorage block_scan; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - /// Copy of raking segment, promoted to registers - PackedCounter cached_segment[RAKING_SEGMENT]; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal storage allocator - */ - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Performs upsweep raking reduction, returning the aggregate - */ - __device__ __forceinline__ PackedCounter Upsweep() - { - PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; - PackedCounter *raking_ptr; - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data into registers - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - cached_segment[i] = smem_raking_ptr[i]; - } - raking_ptr = cached_segment; - } - else - { - raking_ptr = smem_raking_ptr; - } - - return internal::ThreadReduce(raking_ptr, Sum()); - } - - - /// Performs exclusive downsweep raking scan - __device__ __forceinline__ void ExclusiveDownsweep( - PackedCounter raking_partial) - { - PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; - - PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? 
- cached_segment : - smem_raking_ptr; - - // Exclusive raking downsweep scan - internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data back to smem - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - smem_raking_ptr[i] = cached_segment[i]; - } - } - } - - - /** - * Reset shared memory digit counters - */ - __device__ __forceinline__ void ResetCounters() - { - // Reset shared memory digit counters - #pragma unroll - for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) - { - *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; - } - } - - - /** - * Block-scan prefix callback - */ - struct PrefixCallBack - { - __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) - { - PackedCounter block_prefix = 0; - - // Propagate totals in packed fields - #pragma unroll - for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) - { - block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); - } - - return block_prefix; - } - }; - - - /** - * Scan shared memory digit counters. - */ - __device__ __forceinline__ void ScanCounters() - { - // Upsweep scan - PackedCounter raking_partial = Upsweep(); - - // Compute exclusive sum - PackedCounter exclusive_partial; - PrefixCallBack prefix_call_back; - BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); - - // Downsweep scan with exclusive partial - ExclusiveDownsweep(exclusive_partial); - } - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
- */ - __device__ __forceinline__ BlockRadixRank() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Raking - *********************************************************************/ - //@{ - - /** - * \brief Rank keys. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit - { - DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit - DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem - - // Reset shared memory digit counters - ResetCounters(); - - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // Get digit - unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); - - // Get sub-counter - unsigned int sub_counter = digit >> LOG_COUNTER_LANES; - - // Get counter lane - unsigned int counter_lane = digit & (COUNTER_LANES - 1); - - if (IS_DESCENDING) - { - sub_counter = PACKING_RATIO - 1 - sub_counter; - counter_lane = COUNTER_LANES - 1 - counter_lane; - } - - // Pointer to smem digit counter - 
digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; - - // Load thread-exclusive prefix - thread_prefixes[ITEM] = *digit_counters[ITEM]; - - // Store inclusive prefix - *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; - } - - CTA_SYNC(); - - // Scan shared memory counters - ScanCounters(); - - CTA_SYNC(); - - // Extract the local ranks of each key - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // Add in thread block exclusive prefix - ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; - } - } - - - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - // Rank keys - RankKeys(keys, ranks, current_bit, num_bits); - - // Get the inclusive and exclusive digit totals corresponding to the calling thread. - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the - // first counter column, resulting in unavoidable bank conflicts.) 
- unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); - unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); - - exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; - } - } - } -}; - - - - - -/** - * Radix-rank using match.any - */ -template < - int BLOCK_DIM_X, - int RADIX_BITS, - bool IS_DESCENDING, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockRadixRankMatch -{ -private: - - /****************************************************************************** - * Type definitions and constants - ******************************************************************************/ - - typedef int32_t RankT; - typedef int32_t DigitCounterT; - - enum - { - // The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - PADDED_WARPS = ((WARPS & 0x1) == 0) ? - WARPS + 1 : - WARPS, - - COUNTERS = PADDED_WARPS * RADIX_DIGITS, - RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, - PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? 
- RAKING_SEGMENT + 1 : - RAKING_SEGMENT, - }; - -public: - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS), - }; - -private: - - /// BlockScan type - typedef BlockScan< - DigitCounterT, - BLOCK_THREADS, - INNER_SCAN_ALGORITHM, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockScanT; - - - /// Shared memory storage layout type for BlockRadixRank - struct __align__(16) _TempStorage - { - typename BlockScanT::TempStorage block_scan; - - union __align__(16) Aliasable - { - volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; - DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - - } aliasable; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixRankMatch( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Raking - *********************************************************************/ - //@{ - - /** - * \brief Rank keys. 
- */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit - { - // Initialize shared digit counters - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; - - CTA_SYNC(); - - // Each warp will strip-mine its section of input, one strip at a time - - volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; - uint32_t lane_id = LaneId(); - uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); - - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // My digit - uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); - - if (IS_DESCENDING) - digit = RADIX_DIGITS - digit - 1; - - // Mask of peers who have same digit as me - uint32_t peer_mask = MatchAny(digit); - - // Pointer to smem digit counter for this key - digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; - - // Number of occurrences in previous strips - DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; - - // Warp-sync - WARP_SYNC(0xFFFFFFFF); - - // Number of peers having same digit as me - int32_t digit_count = __popc(peer_mask); - - // Number of lower-ranked peers having same digit seen so far - int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); - - if (peer_digit_prefix == 0) - { - // First thread for each digit updates the shared warp counter - *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); - } - - // Warp-sync - WARP_SYNC(0xFFFFFFFF); - - // Number of prior keys having same digit - ranks[ITEM] = warp_digit_prefix + 
DigitCounterT(peer_digit_prefix); - } - - CTA_SYNC(); - - // Scan warp counters - - DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; - - BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; - - CTA_SYNC(); - - // Seed ranks with counter values from previous warps - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - ranks[ITEM] += *digit_counters[ITEM]; - } - - - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - RankKeys(keys, ranks, current_bit, num_bits); - - // Get exclusive count for each digit - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; - } - } - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh deleted file mode 100644 index 736fbde..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_radix_sort.cuh +++ /dev/null @@ -1,862 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. - */ - - -#pragma once - -#include "block_exchange.cuh" -#include "block_radix_rank.cuh" -#include "../util_ptx.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
![](sorting_logo.png) - * \ingroup BlockModule - * - * \tparam KeyT KeyT type - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) - * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Within each key, the implementation treats fixed-length - * bit-sequences of \p RADIX_BITS as radix digit places. 
Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - \rowmajor - * - * \par Performance Considerations - * - \granularity - * - * \par A Simple Example - * \blockcollective{BlockRadixSort} - * \par - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * ... - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ -template < - typename KeyT, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - typename ValueT = NullType, - int RADIX_BITS = 4, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockRadixSort -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - // The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - // Whether or not there are values to be trucked along with keys - KEYS_ONLY = Equals::VALUE, - }; - - // KeyT traits and unsigned bits type - typedef Traits KeyTraits; - typedef typename KeyTraits::UnsignedBits UnsignedBits; - - /// Ascending BlockRadixRank utility type - typedef BlockRadixRank< - BLOCK_DIM_X, - RADIX_BITS, - false, - MEMOIZE_OUTER_SCAN, - INNER_SCAN_ALGORITHM, - SMEM_CONFIG, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - AscendingBlockRadixRank; - - /// Descending BlockRadixRank utility type - typedef BlockRadixRank< - BLOCK_DIM_X, - RADIX_BITS, - true, - MEMOIZE_OUTER_SCAN, - INNER_SCAN_ALGORITHM, - SMEM_CONFIG, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - DescendingBlockRadixRank; - - /// BlockExchange utility type for keys - typedef BlockExchange BlockExchangeKeys; - - /// BlockExchange utility type for values - typedef BlockExchange BlockExchangeValues; - - /// Shared memory storage layout type - union _TempStorage - { - typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; - typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; - typename BlockExchangeKeys::TempStorage exchange_keys; - typename BlockExchangeValues::TempStorage exchange_values; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// 
Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - /// Rank keys (specialized for ascending sort) - __device__ __forceinline__ void RankKeys( - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, - Int2Type /*is_descending*/) - { - AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); - } - - /// Rank keys (specialized for descending sort) - __device__ __forceinline__ void RankKeys( - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, - Int2Type /*is_descending*/) - { - DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); - } - - /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) - __device__ __forceinline__ void ExchangeValues( - ValueT (&values)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - { - CTA_SYNC(); - - // Exchange values through shared memory in blocked arrangement - BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); - } - - /// ExchangeValues (specialized for key-value sort, to-striped arrangement) - __device__ __forceinline__ void ExchangeValues( - ValueT (&values)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - { - CTA_SYNC(); - - // Exchange values through shared memory in blocked 
arrangement - BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); - } - - /// ExchangeValues (specialized for keys-only sort) - template - __device__ __forceinline__ void ExchangeValues( - ValueT (&/*values*/)[ITEMS_PER_THREAD], - int (&/*ranks*/)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - {} - - /// Sort blocked arrangement - template - __device__ __forceinline__ void SortBlocked( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only) ///< Tag whether is keys-only sort - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); - - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); - begin_bit += RADIX_BITS; - - CTA_SYNC(); - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); - - // Exchange values through shared memory in blocked arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - // Quit if done - if (begin_bit >= end_bit) break; - - CTA_SYNC(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - -public: - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not 
document - - /// Sort blocked -> striped arrangement - template - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only) ///< Tag whether is keys-only sort - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); - - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); - begin_bit += RADIX_BITS; - - CTA_SYNC(); - - // Check if this is the last pass - if (begin_bit >= end_bit) - { - // Last pass exchanges keys through shared memory in striped arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); - - // Last pass exchanges through shared memory in striped arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - // Quit - break; - } - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); - - // Exchange values through shared memory in blocked arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - CTA_SYNC(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /// 
\smemstorage{BlockRadixSort} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockRadixSort() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangements) - *********************************************************************/ - //@{ - - /** - * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. 
- * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - /** - * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. 
- * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be - * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. - */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. 
To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
- * - */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangement -> striped arrangement) - *********************************************************************/ - //@{ - - - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... 
- * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. 
- * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. - * - */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. 
- * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
- * - */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - //@} end member group - -}; - -/** - * \example example_block_radix_sort.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh deleted file mode 100644 index ab6b710..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_raking_layout.cuh +++ /dev/null @@ -1,152 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. - */ - - -#pragma once - -#include "../util_macro.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) - * \ingroup BlockModule - * - * \par Overview - * This type facilitates a shared memory usage pattern where a block of CUDA - * threads places elements into shared memory and then reduces the active - * parallelism to one "raking" warp of threads for serially aggregating consecutive - * sequences of shared items. Padding is inserted to eliminate bank conflicts - * (for most data types). - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. 
- * \tparam PTX_ARCH [optional] \ptxversion - */ -template < - typename T, - int BLOCK_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -struct BlockRakingLayout -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// The total number of elements that need to be cooperatively reduced - SHARED_ELEMENTS = BLOCK_THREADS, - - /// Maximum number of warp-synchronous raking threads - MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), - - /// Number of raking elements per warp-synchronous raking thread (rounded up) - SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, - - /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) - RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, - - /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) - HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), - - /// Degree of bank conflicts (e.g., 4-way) - CONFLICT_DEGREE = (HAS_CONFLICTS) ? 
- (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : - 1, - - /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load - USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), - - /// Total number of elements in the raking grid - GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), - - /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) - UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), - }; - - - /** - * \brief Shared memory storage type - */ - struct __align__(16) _TempStorage - { - T buff[BlockRakingLayout::GRID_ELEMENTS]; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /** - * \brief Returns the location for the calling thread to place data into the grid - */ - static __device__ __forceinline__ T* PlacementPtr( - TempStorage &temp_storage, - unsigned int linear_tid) - { - // Offset for partial - unsigned int offset = linear_tid; - - // Add in one padding element for every segment - if (USE_SEGMENT_PADDING > 0) - { - offset += offset / SEGMENT_LENGTH; - } - - // Incorporating a block of padding partials every shared memory segment - return temp_storage.Alias().buff + offset; - } - - - /** - * \brief Returns the location for the calling thread to begin sequential raking - */ - static __device__ __forceinline__ T* RakingPtr( - TempStorage &temp_storage, - unsigned int linear_tid) - { - return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh deleted file mode 100644 index 
a9de9e7..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_reduce.cuh +++ /dev/null @@ -1,607 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_reduce_raking.cuh" -#include "specializations/block_reduce_raking_commutative_only.cuh" -#include "specializations/block_reduce_warp_reductions.cuh" -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * BlockReduceAlgorithm enumerates alternative algorithms for parallel - * reduction across a CUDA thread block. - */ -enum BlockReduceAlgorithm -{ - - /** - * \par Overview - * An efficient "raking" reduction algorithm that only supports commutative - * reduction operators (true for most operations, e.g., addition). - * - * \par - * Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Threads in warps other than the first warp place - * their partial reductions into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within the first - * warp continue to accumulate by raking across segments of shared partial reductions - * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. - * - * \par - * \image html block_reduce.png - *

\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE - * and is preferable when the reduction operator is commutative. This variant - * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall - * throughput across the GPU when suitably occupied. However, turn-around latency may be - * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable - * when the GPU is under-occupied. - */ - BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, - - - /** - * \par Overview - * An efficient "raking" reduction algorithm that supports commutative - * (e.g., addition) and non-commutative (e.g., string concatenation) reduction - * operators. \blocked. - * - * \par - * Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a - * single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. - * - * \par - * \image html block_reduce.png - *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - This variant performs more communication than BLOCK_REDUCE_RAKING - * and is only preferable when the reduction operator is non-commutative. This variant - * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall - * throughput across the GPU when suitably occupied. However, turn-around latency may be - * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable - * when the GPU is under-occupied. - */ - BLOCK_REDUCE_RAKING, - - - /** - * \par Overview - * A quick "tiled warp-reductions" reduction algorithm that supports commutative - * (e.g., addition) and non-commutative (e.g., string concatenation) reduction - * operators. - * - * \par - * Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style - * reduction within each warp. - * -# A propagation phase where the warp reduction outputs in each warp are - * updated with the aggregate from each preceding warp. - * - * \par - * \image html block_scan_warpscans.png - *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING - * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall - * throughput across the GPU. However turn-around latency may be lower and - * thus useful when the GPU is under-occupied. - */ - BLOCK_REDUCE_WARP_REDUCTIONS, -}; - - -/****************************************************************************** - * Block reduce - ******************************************************************************/ - -/** - * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) - * \ingroup BlockModule - * - * \tparam T Data type being reduced - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - \rowmajor - * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. 
A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * - * \par Performance Considerations - * - \granularity - * - Very efficient (only one synchronization barrier). - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size - * - Every thread has a valid input (i.e., full vs. partial-tiles) - * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives - * - * \par A Simple Example - * \blockcollective{BlockReduce} - * \par - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ -template < - typename T, - int BLOCK_DIM_X, - BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockReduce -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - typedef BlockReduceWarpReductions WarpReductions; - typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; - typedef BlockReduceRaking Raking; - - /// Internal specialization type - typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), - WarpReductions, - typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), - RakingCommutativeOnly, - Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking - - /// Shared memory storage layout type for BlockReduce - typedef typename InternalBlockReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockReduce} - struct TempStorage : 
Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockReduce() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... 
- * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction functor - { - return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction functor - { - // Reduce partials - T partial = internal::ThreadReduce(inputs, reduction_op); - return Reduce(partial, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * if (threadIdx.x < num_valid) thread_data = ... 
- * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction functor - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); - } - else - { - return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); - } - } - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... 
- * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); - } - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ T Sum( - T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment - { - // Reduce partials - T partial = internal::ThreadReduce(inputs, cub::Sum()); - return Sum(partial); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. 
- * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item (up to num_items) - * int thread_data; - * if (threadIdx.x < num_valid) - * thread_data = ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage).template Sum(input, num_valid); - } - else - { - return InternalBlockReduce(temp_storage).template Sum(input, num_valid); - } - } - - - //@} end member group -}; - -/** - * \example example_block_reduce.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh deleted file mode 100644 index 245084c..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_scan.cuh +++ /dev/null @@ -1,2126 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "specializations/block_scan_raking.cuh" -#include "specializations/block_scan_warp_scans.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. - */ -enum BlockScanAlgorithm -{ - - /** - * \par Overview - * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. - * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_raking.png - *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - Although this variant may suffer longer turnaround latencies when the - * GPU is under-occupied, it can often provide higher overall throughput - * across the GPU when suitably occupied. - */ - BLOCK_SCAN_RAKING, - - - /** - * \par Overview - * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at - * the expense of higher register pressure. Raking threads preserve their - * "upsweep" segment of values in registers while performing warp-synchronous - * scan, allowing the "downsweep" not to re-read them from shared memory. - */ - BLOCK_SCAN_RAKING_MEMOIZE, - - - /** - * \par Overview - * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. - * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_warpscans.png - *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warpscans, it can - * often provide lower turnaround latencies when the GPU is under-occupied. - */ - BLOCK_SCAN_WARP_SCANS, -}; - - -/****************************************************************************** - * Block scan - ******************************************************************************/ - -/** - * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) - * \ingroup BlockModule - * - * \tparam T Data type being scanned - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - \rowmajor - * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: - * -# cub::BLOCK_SCAN_RAKING. 
An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * - * \par Performance Considerations - * - \granularity - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Invokes a minimal number of minimal block-wide synchronization barriers (only - * one or two depending on algorithm selection) - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Prefix sum variants (vs. generic scan) - * - \blocksize - * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives - * - * \par A Simple Example - * \blockcollective{BlockScan} - * \par - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. - * The corresponding output \p thread_data in those threads will be - * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. - * - */ -template < - typename T, - int BLOCK_DIM_X, - BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockScan -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /** - * Ensure the template parameterization meets the requirements of the - * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy - * cannot be used with thread block sizes not a multiple of the - * architectural warp size. - */ - static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? 
- BLOCK_SCAN_RAKING : - ALGORITHM; - - typedef BlockScanWarpScans WarpScans; - typedef BlockScanRaking Raking; - - /// Define the delegate type for the desired algorithm - typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), - WarpScans, - Raking>::Type InternalBlockScan; - - /// Shared memory storage layout type for BlockScan - typedef typename InternalBlockScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Public types - ******************************************************************************/ -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockScan() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
- * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
- * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. 
- * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, ..., 127. - * The output for the second segment will be 128, 129, ..., 255. 
- * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - - //@} end member group // Exclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); - } - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage.scan).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans - - /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, single datum per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. 
- * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. 
- * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - // Reduce consecutive thread items in registers - T thread_partial = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Exclusive scan in registers with prefix - internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - //@} end member group -#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans - - /******************************************************************//** - * \name Inclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - InclusiveScan(input, output, cub::Sum()); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
- * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InclusiveScan(input, output, cub::Sum(), block_aggregate); - } - - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. 
- * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, ..., 128. - * The output for the second segment will be 129, 130, ..., 256. 
- * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0]); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveSum(thread_prefix, thread_prefix); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be - * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. 
Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
- * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage.scan).IncluisveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512. - * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_prefix_callback_op); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); - } - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. 
- * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op); - - // Inclusive scan in registers with prefix as seed (first thread does not seed) - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan (with no initial value) - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); - - // Inclusive scan in registers with prefix as seed (first thread does not seed) - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage.scan).InclusiveScan( - * thread_data, thread_data, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); - } - } - - //@} end member group - - -}; - -/** - * \example example_block_scan.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh deleted file mode 100644 index 504f00e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_shuffle.cuh +++ /dev/null @@ -1,305 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "../util_arch.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. - * \ingroup BlockModule - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * It is commonplace for blocks of threads to rearrange data items between - * threads. The BlockShuffle abstraction allows threads to efficiently shift items - * either (a) up to their successor or (b) down to their predecessor. 
- * - */ -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockShuffle -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - enum - { - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T prev[BLOCK_THREADS]; - T next[BLOCK_THREADS]; - }; - - -public: - - /// \smemstorage{BlockShuffle} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory 
as temporary storage. - */ - __device__ __forceinline__ BlockShuffle() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockShuffle( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Shuffle movement - *********************************************************************/ - //@{ - - - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. - * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Offset( - T input, ///< [in] The input item from the calling thread (threadi) - T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 - int distance = 1) ///< [in] Offset distance (may be negative) - { - temp_storage[linear_tid].prev = input; - - CTA_SYNC(); - - if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) - output = temp_storage[linear_tid + distance].prev; - } - - - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. - * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Rotate( - T input, ///< [in] The calling thread's input item - T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). 
This value is not updated for threadBLOCK_THREADS-1 - unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) - { - temp_storage[linear_tid].prev = input; - - CTA_SYNC(); - - unsigned int offset = threadIdx.x + distance; - if (offset >= BLOCK_THREADS) - offset -= BLOCK_THREADS; - - output = temp_storage[offset].prev; - } - - - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - */ - template - __device__ __forceinline__ void Up( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. - { - temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) - prev[ITEM] = input[ITEM - 1]; - - - if (linear_tid > 0) - prev[0] = temp_storage[linear_tid - 1].prev; - } - - - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - */ - template - __device__ __forceinline__ void Up( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. 
- T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads - { - Up(input, prev); - block_suffix = temp_storage[BLOCK_THREADS - 1].prev; - } - - - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - */ - template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. - { - temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) - prev[ITEM] = input[ITEM - 1]; - - if (linear_tid > 0) - prev[0] = temp_storage[linear_tid - 1].prev; - } - - - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - */ - template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
- T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads - { - Up(input, prev); - block_prefix = temp_storage[BLOCK_THREADS - 1].prev; - } - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh deleted file mode 100644 index 63039af..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/block_store.cuh +++ /dev/null @@ -1,1000 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Operations for writing linear segments of data from the CUDA thread block - */ - -#pragma once - -#include - -#include "block_exchange.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -/******************************************************************//** - * \name Blocked arrangement I/O (direct) - *********************************************************************/ -//@{ - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. - * - * \blocked - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - thread_itr[ITEM] = items[ITEM]; - } -} - - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range - * - * \blocked - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. - */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) - { - thread_itr[ITEM] = items[ITEM]; - } - } -} - - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. 
- * - * \blocked - * - * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, - * which is the default starting offset returned by \p cudaMalloc() - * - * \par - * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * - */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void StoreDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - enum - { - // Maximum CUDA vector size is 4 elements - MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), - - // Vector size must be a power of two and an even divisor of the items per thread - VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
- MAX_VEC_SIZE : - 1, - - VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, - }; - - // Vector type - typedef typename CubVector::Type Vector; - - // Alias global pointer - Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); - - // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) - Vector raw_vector[VECTORS_PER_THREAD]; - T *raw_items = reinterpret_cast(raw_vector); - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - raw_items[ITEM] = items[ITEM]; - } - - // Direct-store using vector types - StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); -} - - - -//@} end member group -/******************************************************************//** - * \name Striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items. - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - OutputIteratorT thread_itr = block_itr + linear_tid; - - // Store directly in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; - } -} - - -/** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - OutputIteratorT thread_itr = block_itr + linear_tid; - - // Store directly in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) - { - thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; - } - } -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - OutputIteratorT thread_itr = block_itr + warp_offset + tid; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; - } -} - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - OutputIteratorT thread_itr = block_itr + warp_offset + tid; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) - { - thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; - } - } -} - - -//@} end member group - - -/** @} */ // end group UtilIo - - -//----------------------------------------------------------------------------- -// Generic BlockStore abstraction -//----------------------------------------------------------------------------- - -/** - * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. - */ -enum BlockStoreAlgorithm -{ - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is written - * directly to memory. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) decreases as the - * access stride between threads increases (i.e., the number items per thread). 
- */ - BLOCK_STORE_DIRECT, - - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is written directly - * to memory using CUDA's built-in vectorized stores as a coalescing optimization. - * For example, st.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high until the the - * access stride between threads (i.e., the number items per thread) exceeds the - * maximum vector store width (typically 4 items or 64B, whichever is lower). - * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p OutputIteratorT is not a simple pointer type - * - The block output offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - */ - BLOCK_STORE_VECTORIZE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
- */ - BLOCK_STORE_TRANSPOSE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a - * [warp-striped arrangement](index.html#sec5sec3) - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. - */ - BLOCK_STORE_WARP_TRANSPOSE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a - * [warp-striped arrangement](index.html#sec5sec3) - * To reduce the shared memory requirement, only one warp's worth of shared - * memory is provisioned and is subsequently time-sliced among warps. - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - Provisions less shared memory temporary storage, but incurs larger - * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. - */ - BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - -}; - - -/** - * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam T The type of data to be written. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. 
- * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The BlockStore class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - BlockStore can be optionally specialized by different data movement strategies: - * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written - * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) - * of data is written directly to memory using CUDA's built-in vectorized stores as a - * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) - * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is - * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) - * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is - * then written to memory. 
[More...](\ref cub::BlockStoreAlgorithm) - * - \rowmajor - * - * \par A Simple Example - * \blockcollective{BlockStore} - * \par - * The code snippet below illustrates the storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... 
- * - */ -template < - typename T, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockStore -{ -private: - /****************************************************************************** - * Constants and typed definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - /// Store helper - template - struct StoreInternal; - - - /** - * BLOCK_STORE_DIRECT specialization of store helper - */ - template - struct StoreInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreDirectBlocked(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - StoreDirectBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_VECTORIZE specialization of store helper - */ - template 
- struct StoreInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Store( - T *block_ptr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreDirectBlocked(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - StoreDirectBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - 
/// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToStriped(items); - StoreDirectStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - - /** - * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared 
storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreDirectWarpStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - - /** - * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper - */ - template - struct StoreInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : 
Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreDirectWarpStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef StoreInternal InternalStore; - - - /// Shared memory storage layout type - typedef typename InternalStore::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ 
_TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - - /// \smemstorage{BlockStore} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockStore() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Store items into a linear segment of memory. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. 
The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... - * - */ - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items); - } - - /** - * \brief Store items into a linear segment of memory, guarded by range. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
- * { - * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. - * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with - * only the first two threads being unmasked to store portions of valid data. - * - */ - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh deleted file mode 100644 index 4599c09..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_atomic.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ -template -struct BlockHistogramAtomic -{ - /// Shared memory storage layout type - struct TempStorage {}; - - - /// Constructor - __device__ __forceinline__ BlockHistogramAtomic( - TempStorage &temp_storage) - {} - - - /// Composite data onto an existing histogram - template < - typename T, - typename CounterT, - int ITEMS_PER_THREAD> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - // Update histogram - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { - atomicAdd(histogram + items[i], 1); - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh deleted file mode 100644 index b9ad6fb..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_histogram_sort.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
- */ - -#pragma once - -#include "../../block/block_radix_sort.cuh" -#include "../../block/block_discontinuity.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ -template < - typename T, ///< Sample type - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int ITEMS_PER_THREAD, ///< The number of samples per thread - int BINS, ///< The number of bins into which histogram samples may fall - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockHistogramSort -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - // Parameterize BlockRadixSort type for our thread block - typedef BlockRadixSort< - T, - BLOCK_DIM_X, - ITEMS_PER_THREAD, - NullType, - 4, - (PTX_ARCH >= 350) ? 
true : false, - BLOCK_SCAN_WARP_SCANS, - cudaSharedMemBankSizeFourByte, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockRadixSortT; - - // Parameterize BlockDiscontinuity type for our thread block - typedef BlockDiscontinuity< - T, - BLOCK_DIM_X, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockDiscontinuityT; - - /// Shared memory - union _TempStorage - { - // Storage for sorting bin values - typename BlockRadixSortT::TempStorage sort; - - struct - { - // Storage for detecting discontinuities in the tile of sorted bin values - typename BlockDiscontinuityT::TempStorage flag; - - // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values - unsigned int run_begin[BINS]; - unsigned int run_end[BINS]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockHistogramSort( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - // Discontinuity functor - struct DiscontinuityOp - { - // Reference to temp_storage - _TempStorage &temp_storage; - - // Constructor - __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : - temp_storage(temp_storage) - {} - - // Discontinuity predicate - __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) - { - if (a != b) - { - // Note the begin/end offsets in shared storage - temp_storage.run_begin[b] = b_index; - temp_storage.run_end[a] = b_index; - - return true; - } - else - { - return false; - } - } - }; - - - // Composite data onto an existing histogram - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to 
shared/device-accessible memory histogram - { - enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; - - // Sort bytes in blocked arrangement - BlockRadixSortT(temp_storage.sort).Sort(items); - - CTA_SYNC(); - - // Initialize the shared memory's run_begin and run_end for each bin - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - - CTA_SYNC(); - - int flags[ITEMS_PER_THREAD]; // unused - - // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile - DiscontinuityOp flag_op(temp_storage); - BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); - - // Update begin for first item - if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; - - CTA_SYNC(); - - // Composite into histogram - histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - int thread_offset = histo_offset + linear_tid; - CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - - // Finish up with guarded composition if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - int thread_offset = histo_offset + linear_tid; - CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh deleted file mode 100644 index c2c2665..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking.cuh +++ /dev/null @@ -1,222 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - */ - -#pragma once - -#include "../../block/block_raking_layout.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - * - * Supports non-commutative binary reduction operators. Unlike commutative - * reduction operators (e.g., addition), the application of a non-commutative - * reduction operator (e.g, string concatenation) across a sequence of inputs must - * honor the relative ordering of items and partial reductions when applying the - * reduction operator. - * - * Compared to the implementation of BlockReduceRaking (which does not support - * non-commutative operators), this implementation requires a few extra - * rounds of inter-thread communication. 
- */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceRaking -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - /// Constants - enum - { - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), - - /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two - WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, - - /// Whether or not accesses into smem are unguarded - RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, - - }; - - - /// Shared memory storage layout type - union _TempStorage - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockReduceRaking( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - template - __device__ __forceinline__ T RakingReduction( - ReductionOp reduction_op, ///< [in] Binary scan operator - T *raking_segment, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) - { - // Update partial if addend is in range - if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) - { - T addend = raking_segment[ITERATION]; - partial = reduction_op(partial, addend); - } - return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); - } - - template - __device__ __forceinline__ T RakingReduction( - ReductionOp /*reduction_op*/, ///< [in] Binary scan operator - T * /*raking_segment*/, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) - { - return partial; - } - - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool IS_FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage).template Reduce( - partial, - num_valid, - reduction_op); - } - else - { - // Place partial into shared memory grid. 
- *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = raking_segment[0]; - - partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); - - partial = WarpReduce(temp_storage.warp_storage).template Reduce( - partial, - num_valid, - reduction_op); - - } - } - - return partial; - } - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - - return Reduce(partial, num_valid, reduction_op); - } - - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh deleted file mode 100644 index ee22946..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ /dev/null @@ -1,199 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 
- */ - -#pragma once - -#include "block_reduce_raking.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. - */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceRakingCommutativeOnly -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values - typedef BlockReduceRaking FallBack; - - /// Constants - enum - { - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// Whether or not to use fall-back - USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), - - /// Number of raking threads - RAKING_THREADS = WARP_THREADS, - - /// Number of threads actually sharing items with the raking threads - SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, - }; - - /// WarpReduce utility type - typedef WarpReduce WarpReduce; - - /// Layout type for padded thread block raking grid - typedef 
BlockRakingLayout BlockRakingLayout; - - /// Shared memory storage layout type - union _TempStorage - { - struct - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; - typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockReduceRakingCommutativeOnly( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
- template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - if (USE_FALLBACK || !FULL_TILE) - { - return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); - } - else - { - // Place partial into shared memory grid - if (linear_tid >= RAKING_THREADS) - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); - - // Warpscan - partial = WarpReduce(temp_storage.warp_storage).Sum(partial); - } - } - - return partial; - } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
- template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - if (USE_FALLBACK || !FULL_TILE) - { - return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); - } - else - { - // Place partial into shared memory grid - if (linear_tid >= RAKING_THREADS) - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = internal::ThreadReduce(raking_segment, reduction_op, partial); - - // Warpscan - partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); - } - } - - return partial; - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh deleted file mode 100644 index 68495b4..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_reduce_warp_reductions.cuh +++ /dev/null @@ -1,222 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
- */ - -#pragma once - -#include "../../warp/warp_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_arch.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceWarpReductions -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// Number of active warps - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// The logical warp size for warp reductions - LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - - /// Whether or not the logical warp size evenly divides the thread block size - EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) - }; - - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire thread block - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int 
linear_tid; - unsigned int warp_id; - unsigned int lane_id; - - - /// Constructor - __device__ __forceinline__ BlockReduceWarpReductions( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) - { - if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) - { - T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; - warp_aggregate = reduction_op(warp_aggregate, addend); - } - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); - } - - template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp /*reduction_op*/, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) - { - return warp_aggregate; - } - - - /// Returns block-wide aggregate in thread0. 
- template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - // Share lane aggregates - if (lane_id == 0) - { - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - } - - CTA_SYNC(); - - // Update total aggregate in warp 0, lane 0 - if (linear_tid == 0) - { - warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); - } - - return warp_aggregate; - } - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? - LOGICAL_WARP_SIZE : - (warp_offset < num_valid) ? - num_valid - warp_offset : - 0; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( - input, - warp_num_valid, - cub::Sum()); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
- template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? - LOGICAL_WARP_SIZE : - (warp_offset < static_cast(num_valid)) ? - num_valid - warp_offset : - 0; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( - input, - warp_num_valid, - reduction_op); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh deleted file mode 100644 index 2e21324..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_raking.cuh +++ /dev/null @@ -1,666 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - - -/** - * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_ptx.cuh" -#include "../../util_arch.cuh" -#include "../../block/block_raking_layout.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../thread/thread_scan.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
- */ -template < - typename T, ///< Data type being scanned - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanRaking -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// Constants - enum - { - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), - }; - - /// WarpScan utility type - typedef WarpScan WarpScan; - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - T block_aggregate; ///< Block aggregate - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - 
_TempStorage &temp_storage; - unsigned int linear_tid; - T cached_segment[SEGMENT_LENGTH]; - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /// Templated reduction - template - __device__ __forceinline__ T GuardedReduce( - T* raking_ptr, ///< [in] Input array - ScanOp scan_op, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) - { - if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) - { - T addend = raking_ptr[ITERATION]; - raking_partial = scan_op(raking_partial, addend); - } - - return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); - } - - - /// Templated reduction (base case) - template - __device__ __forceinline__ T GuardedReduce( - T* /*raking_ptr*/, ///< [in] Input array - ScanOp /*scan_op*/, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) - { - return raking_partial; - } - - - /// Templated copy - template - __device__ __forceinline__ void CopySegment( - T* out, ///< [out] Out array - T* in, ///< [in] Input array - Int2Type /*iteration*/) - { - out[ITERATION] = in[ITERATION]; - CopySegment(out, in, Int2Type()); - } - - - /// Templated copy (base case) - __device__ __forceinline__ void CopySegment( - T* /*out*/, ///< [out] Out array - T* /*in*/, ///< [in] Input array - Int2Type /*iteration*/) - {} - - - /// Performs upsweep raking reduction, returning the aggregate - template - __device__ __forceinline__ T Upsweep( - ScanOp scan_op) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data into registers - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - - T raking_partial = cached_segment[0]; - - return GuardedReduce(cached_segment, scan_op, raking_partial, 
Int2Type<1>()); - } - - - /// Performs exclusive downsweep raking scan - template - __device__ __forceinline__ void ExclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data back into registers - if (!MEMOIZE) - { - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - } - - internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); - - // Write data back to smem - CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); - } - - - /// Performs inclusive downsweep raking scan - template - __device__ __forceinline__ void InclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data back into registers - if (!MEMOIZE) - { - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - } - - internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); - - // Write data back to smem - CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); - } - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ BlockScanRaking( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - exclusive_output = *placement_ptr; - } - } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Exclusive Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial); - } - - CTA_SYNC(); - - // Grab exclusive partial from shared memory - output = *placement_ptr; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial= Upsweep(scan_op); - - // Warp-synchronous scan - T inclusive_partial; - T exclusive_partial; - WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - - // Broadcast aggregate to all threads - if (linear_tid == RAKING_THREADS - 1) - temp_storage.block_aggregate = inclusive_partial; - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial); - - // Broadcast aggregate to other threads - if (linear_tid == 0) - temp_storage.block_aggregate = block_aggregate; - } - - CTA_SYNC(); - - // Grab exclusive partial from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - T block_aggregate; - WarpScan warp_scan(temp_storage.warp_scan); - warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); - - // Obtain warp-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - output = scan_op(block_prefix, output); - if (linear_tid == 0) - output = block_prefix; - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - WarpScan warp_scan(temp_storage.warp_scan); - - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial, block_aggregate; - warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - - // Obtain block-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with warpscan exclusive partial - T downsweep_prefix = scan_op(block_prefix, exclusive_partial); - if (linear_tid == 0) - downsweep_prefix = 
block_prefix; - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, downsweep_prefix); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Exclusive Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T inclusive_partial; - T exclusive_partial; - WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - - // Broadcast aggregate to all threads - if (linear_tid == RAKING_THREADS - 1) - temp_storage.block_aggregate = inclusive_partial; - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - T block_aggregate; - WarpScan warp_scan(temp_storage.warp_scan); - warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); - - // Obtain warp-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with exclusive warpscan partial - output = scan_op(block_prefix, output); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - WarpScan warp_scan(temp_storage.warp_scan); - - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial, block_aggregate; - warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - - // Obtain block-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with warpscan exclusive partial - T downsweep_prefix = scan_op(block_prefix, exclusive_partial); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, 
downsweep_prefix); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh deleted file mode 100644 index 9252c0a..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans.cuh +++ /dev/null @@ -1,392 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../util_ptx.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
- */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanWarpScans -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of active warps - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /// WarpScan utility type - typedef WarpScan WarpScanT; - - /// WarpScan utility type - typedef WarpScan WarpAggregateScan; - - /// Shared memory storage layout type - - struct __align__(32) _TempStorage - { - T warp_aggregates[WARPS]; - typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T block_prefix; ///< Shared prefix for the entire thread block - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - unsigned int warp_id; - unsigned int lane_id; - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ BlockScanWarpScans( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); - } - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction - ScanOp /*scan_op*/, ///< [in] Binary scan operator - T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) - {} - - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
- template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - - CTA_SYNC(); - - // Accumulate block aggregates and save the one that is our warp's prefix - T warp_prefix; - block_aggregate = temp_storage.warp_aggregates[0]; - - // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); -/* - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - } -*/ - - return warp_prefix; - } - - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
- template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan - { - T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); - - warp_prefix = scan_op(initial_value, warp_prefix); - - if (warp_id == 0) - warp_prefix = initial_value; - - return warp_prefix; - } - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); - - // Apply warp prefix to our lane's partial - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
- T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - if (linear_tid > 0) - { - exclusive_output = scan_op(block_prefix, exclusive_output); - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - inclusive_output = scan_op(warp_prefix, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
- { - T block_aggregate; - InclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - exclusive_output = scan_op(block_prefix, exclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh deleted file mode 100644 index eb0a3a1..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans2.cuh +++ /dev/null @@ -1,436 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../util_ptx.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
- */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanWarpScans -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of active warps - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /// WarpScan utility type - typedef WarpScan WarpScanT; - - /// WarpScan utility type - typedef WarpScan WarpAggregateScanT; - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans - typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T warp_aggregates[WARPS]; - T block_prefix; ///< Shared prefix for the entire thread block - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - unsigned int warp_id; - unsigned int lane_id; - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ 
BlockScanWarpScans( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type addend_warp) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); - } - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type addend_warp) - {} - - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
- template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - - CTA_SYNC(); - - // Accumulate block aggregates and save the one that is our warp's prefix - T warp_prefix; - block_aggregate = temp_storage.warp_aggregates[0]; - - // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); -/* - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - } -*/ - - return warp_prefix; - } - - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
- template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan - { - T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); - - warp_prefix = scan_op(initial_value, warp_prefix); - - if (warp_id == 0) - warp_prefix = initial_value; - - return warp_prefix; - } - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); - - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. 
-// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - -//-------------------------------------------------- - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - // Get the warp scan partial - T warp_inclusive, warp_prefix; - if (lane_id < WARPS) - { - // Scan the warpscan partials - T warp_val = temp_storage.warp_aggregates[lane_id]; - WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); - } - - warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); - block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); -//-------------------------------------------------- - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); - - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- T inclusive_output; - my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp -// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); - -//-------------------------------------------------- - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - // Get the warp scan partial - T warp_inclusive, warp_prefix; - if (lane_id < WARPS) - { - // Scan the warpscan partials - T warp_val = temp_storage.warp_aggregates[lane_id]; - WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); - } - - warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); - block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); -//-------------------------------------------------- - - // Apply warp prefix to our lane's partial - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - if (linear_tid > 0) - { - exclusive_output = scan_op(block_prefix, exclusive_output); - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - inclusive_output = scan_op(warp_prefix, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - T block_aggregate; - InclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - exclusive_output = scan_op(block_prefix, exclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh deleted file mode 100644 index 18bd585..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/block/specializations/block_scan_warp_scans3.cuh +++ /dev/null @@ -1,418 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
- */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../util_ptx.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanWarpScans -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of warp threads - INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, - - /// Number of outer scan warps - OUTER_WARPS = INNER_WARP_THREADS - }; - - /// Outer WarpScan utility type - typedef WarpScan OuterWarpScanT; - - /// Inner WarpScan utility type - typedef WarpScan InnerWarpScanT; - - typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; - - - /// Shared memory storage layout type - struct _TempStorage - { - union Aliasable - { - Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans - typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan - - } aliasable; - - T warp_aggregates[OUTER_WARPS]; - - T block_aggregate; ///< Shared prefix for the entire thread block - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : 
Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - unsigned int warp_id; - unsigned int lane_id; - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ BlockScanWarpScans( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), - lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) - {} - - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - if (warp_id != 0) - { - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - { - temp_storage.warp_aggregates[warp_id] = inclusive_output; - } - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute warp scan in each warp. 
The exclusive output from each lane0 is invalid. - T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); - - T upsweep = temp_storage.warp_aggregates[linear_tid]; - T downsweep_prefix, block_aggregate; - - inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); - - // Use callback functor to get block prefix in lane0 and then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = inner_scan.Broadcast(block_prefix, 0); - - downsweep_prefix = scan_op(block_prefix, downsweep_prefix); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; - } - - CTA_SYNC(); - - // Apply warp prefix to our lane's partial (or assign it if partial is invalid) - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( - input, inclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - if (warp_id != 0) - { - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( - input, inclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); - - T upsweep = temp_storage.warp_aggregates[linear_tid]; - T downsweep_prefix, block_aggregate; - inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); - - // Use callback functor to get block prefix in lane0 and then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = inner_scan.Broadcast(block_prefix, 0); - - downsweep_prefix = scan_op(block_prefix, downsweep_prefix); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; - } - - CTA_SYNC(); - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh deleted file mode 100644 index b1c8e32..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/cub.cuh +++ /dev/null @@ -1,95 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * CUB umbrella include file - */ - -#pragma once - - -// Block -#include "block/block_histogram.cuh" -#include "block/block_discontinuity.cuh" -#include "block/block_exchange.cuh" -#include "block/block_load.cuh" -#include "block/block_radix_rank.cuh" -#include "block/block_radix_sort.cuh" -#include "block/block_reduce.cuh" -#include "block/block_scan.cuh" -#include "block/block_store.cuh" -//#include "block/block_shift.cuh" - -// Device -#include "device/device_histogram.cuh" -#include "device/device_partition.cuh" -#include "device/device_radix_sort.cuh" -#include "device/device_reduce.cuh" -#include "device/device_run_length_encode.cuh" -#include "device/device_scan.cuh" -#include "device/device_segmented_radix_sort.cuh" -#include "device/device_segmented_reduce.cuh" -#include "device/device_select.cuh" -#include "device/device_spmv.cuh" - -// Grid -//#include "grid/grid_barrier.cuh" -#include "grid/grid_even_share.cuh" -#include "grid/grid_mapping.cuh" -#include "grid/grid_queue.cuh" - -// Thread -#include "thread/thread_load.cuh" -#include "thread/thread_operators.cuh" -#include "thread/thread_reduce.cuh" -#include "thread/thread_scan.cuh" -#include "thread/thread_store.cuh" - -// Warp -#include "warp/warp_reduce.cuh" -#include "warp/warp_scan.cuh" - -// Iterator -#include "iterator/arg_index_input_iterator.cuh" -#include "iterator/cache_modified_input_iterator.cuh" -#include "iterator/cache_modified_output_iterator.cuh" -#include "iterator/constant_input_iterator.cuh" -#include "iterator/counting_input_iterator.cuh" -#include "iterator/tex_obj_input_iterator.cuh" -#include "iterator/tex_ref_input_iterator.cuh" -#include "iterator/transform_input_iterator.cuh" - -// Util -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_device.cuh" -#include "util_macro.cuh" -#include "util_ptx.cuh" -#include "util_type.cuh" - diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh deleted file mode 100644 index db131ee..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_histogram.cuh +++ /dev/null @@ -1,866 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. - */ - -#pragma once - -#include -#include -#include - -#include "dispatch/dispatch_histogram.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * \par Usage Considerations - * \cdp_class{DeviceHistogram} - * - */ -struct DeviceHistogram -{ - /******************************************************************//** - * \name Evenly-segmented bin ranges - *********************************************************************/ - //@{ - - /** - * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
- * - * \par - * - The number of histogram bins is (\p num_levels - 1) - * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a sequence of float samples - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. 
\offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. - LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. - OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT lower_level1[1] = {lower_level}; - LevelT upper_level1[1] = {upper_level}; - - return MultiHistogramEven<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - lower_level1, - upper_level1, - num_samples, - 1, - sizeof(SampleT) * num_samples, - stream, - debug_synchronous); - } - - - /** - * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. - * - * \par - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins is (\p num_levels - 1) - * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * size_t row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. 
- LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. - LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. - OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT lower_level1[1] = {lower_level}; - LevelT upper_level1[1] = {upper_level}; - - return MultiHistogramEven<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - lower_level1, - upper_level1, - num_row_samples, - num_rows, - row_stride_bytes, - stream, - debug_synchronous); - } - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). - * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA - * pixel samples). - * - The number of histogram bins for channeli is num_levels[i] - 1. 
- * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 256-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_pixels; // e.g., 5 - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); - * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam 
SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
- OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - return MultiHistogramEven( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - lower_level, - upper_level, - num_pixels, - 1, - sizeof(SampleT) * NUM_CHANNELS * num_pixels, - stream, - debug_synchronous); - } - - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). - * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA - * pixel samples). - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins for channeli is num_levels[i] - 1. 
- * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of - * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), - * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - Int2Type is_byte_sample; - - if ((sizeof(OffsetT) > sizeof(int)) && - ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) - { - // Down-convert OffsetT data type - - - return DipatchHistogram::DispatchEven( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, - (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - return DipatchHistogram::DispatchEven( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, - num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - - //@} end member group - /******************************************************************//** - * \name Custom bin ranges - *********************************************************************/ - //@{ - - /** - * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. - * - * \par - * - The number of histogram bins is (\p num_levels - 1) - * - The value range for bini is [level[i], level[i+1]) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of an six-bin histogram - * from a sequence of float samples - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_samples); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT* d_levels1[1] = {d_levels}; - - return MultiHistogramRange<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - d_levels1, - num_samples, - 1, - sizeof(SampleT) * num_samples, - stream, - debug_synchronous); - } - - - /** - * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. - * - * \par - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins is (\p num_levels - 1) - * - The value range for bini is [level[i], level[i+1]) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. 
- * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * int row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ , , , , , , , ] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT* d_levels1[1] = {d_levels}; - - return MultiHistogramRange<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - d_levels1, - num_row_samples, - num_rows, - row_stride_bytes, - stream, - debug_synchronous); - } - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). 
- * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA - * pixel samples). - * - The number of histogram bins for channeli is num_levels[i] - 1. - * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 4-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_pixels; // e.g., 5 - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), - * // (0, 6, 7, 5),(3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // d_histogram <-- [ [1, 3, 0, 1], - * // [3, 0, 0, 2], - * // [0, 2, 0, 3] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - return MultiHistogramRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_pixels, - 1, - sizeof(SampleT) * NUM_CHANNELS * num_pixels, - stream, - debug_synchronous); - } - - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). 
- * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA - * pixel samples). - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins for channeli is num_levels[i] - 1. - * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of - * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), - * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] - * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [2, 3, 0, 1], - * // [3, 0, 0, 2], - * // [1, 2, 0, 3] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. 
The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - Int2Type is_byte_sample; - - if ((sizeof(OffsetT) > sizeof(int)) && - ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) - { - // Down-convert OffsetT data type - return DipatchHistogram::DispatchRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, - (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - return DipatchHistogram::DispatchRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, - num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - - - //@} end member group -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh deleted file mode 100644 index 154506e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_partition.cuh +++ /dev/null @@ -1,273 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_select_if.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
![](partition_logo.png) - * \ingroup SingleModule - * - * \par Overview - * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from - * a specified input sequence. - * - * \par Usage Considerations - * \cdp_class{DevicePartition} - * - * \par Performance - * \linear_performance{partition} - * - * \par - * The following chart illustrates DevicePartition::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected for the first partition. - * \plots_below - * - * \image html partition_if_int32_50_percent.png - * - */ -struct DevicePartition -{ - /** - * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original - * relative ordering, however copies of the unselected items are compacted into the - * rear of \p d_out in reverse order. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] - * // d_num_selected_out <-- [4] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename FlagIterator, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) - int num_items, ///< [in] Total number of items to select from - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType SelectOp; // Selection op (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) - * - * \par - * - Copies of the selected items are compacted into \p d_out and maintain their original - * relative ordering, however copies of the unselected items are compacted into the - * rear of \p d_out in reverse order. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated partition-if performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. 
Items are - * selected for the first partition with 50% probability. - * - * \image html partition_if_int32_50_percent.png - * \image html partition_if_int64_50_percent.png - * - * \par - * The following charts are similar, but 5% selection probability for the first partition: - * - * \image html partition_if_int32_5_percent.png - * \image html partition_if_int64_5_percent.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) - int num_items, ///< [in] Total number of items to select from - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_partition_flagged.cu - * \example example_device_partition_if.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh deleted file mode 100644 index fe6cad6..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_radix_sort.cuh +++ /dev/null @@ -1,796 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_radix_sort.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) - * \ingroup SingleModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending (or descending) order. The algorithm relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - * \par Usage Considerations - * \cdp_class{DeviceRadixSort} - * - * \par Performance - * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys - * performance across different CUDA architectures for uniform-random \p uint32 keys. 
- * \plots_below - * - * \image html lsb_radix_sort_int32_keys.png - * - */ -struct DeviceRadixSort -{ - - /******************************************************************//** - * \name KeyT-value pairs - *********************************************************************/ - //@{ - - /** - * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random uint32,uint32 and - * uint64,uint64 pairs, respectively. - * - * \image html lsb_radix_sort_int32_pairs.png - * \image html lsb_radix_sort_int64_pairs.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [ ... ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] - * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random uint32,uint32 and - * uint64,uint64 pairs, respectively. 
- * - * \image html lsb_radix_sort_int32_pairs.png - * \image html lsb_radix_sort_int64_pairs.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. 
- * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortPairs. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [ ... ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] - * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). 
- * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortPairs. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [ ... ] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Keys-only - *********************************************************************/ - //@{ - - - /** - * \brief Sorts keys into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. 
This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. - * - * \image html lsb_radix_sort_int32_keys.png - * \image html lsb_radix_sort_int64_keys.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts keys into ascending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). 
- * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. - * - * \image html lsb_radix_sort_int32_keys.png - * \image html lsb_radix_sort_int64_keys.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - /** - * \brief Sorts keys into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortKeys. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts keys into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortKeys. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - - -}; - -/** - * \example example_device_radix_sort.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh deleted file mode 100644 index 3939a7e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_reduce.cuh +++ /dev/null @@ -1,734 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include -#include - -#include "../iterator/arg_index_input_iterator.cuh" -#include "dispatch/dispatch_reduce.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
- * - * \par Usage Considerations - * \cdp_class{DeviceReduce} - * - * \par Performance - * \linear_performance{reduction, reduce-by-key, and run-length encode} - * - * \par - * The following chart illustrates DeviceReduce::Sum - * performance across different CUDA architectures for \p int32 keys. - * - * \image html reduce_int32.png - * - * \par - * The following chart illustrates DeviceReduce::ReduceByKey (summation) - * performance across different CUDA architectures for \p fp32 - * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceReduce -{ - /** - * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReductionOpT, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - T init, ///< [in] Initial value of the reduction - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - reduction_op, - init, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide sum using the addition (\p +) operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction. - * - Does not support \p + operators that are non-commutative.. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sum-reduction performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html reduce_int32.png - * \image html reduce_int64.png - * - * \par Snippet - * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction. - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. - * - * \par - * This operation computes segmented reductions within \p d_values_in using - * the specified binary \p reduction_op functor. The segments are identified by - * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of - * consecutive, identical keys. For the ith run encountered, - * the first key of the run and the corresponding value aggregate of that run are - * written to d_unique_out[i] and d_aggregates_out[i], - * respectively. The total number of runs encountered is written to \p d_num_runs_out. - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. 
- * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments - * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * \image html reduce_by_key_fp64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html reduce_by_key_fp32_len_5.png - * \image html reduce_by_key_fp64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the segmented reduction of \p int values grouped - * by runs of associated \p int keys. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator - * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t ReduceByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // FlagT iterator type (not used) - - // Selection op (not used) - - // Default == operator - typedef Equality EqualityOp; - - return DispatchReduceByKey::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - EqualityOp(), - reduction_op, - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_reduce.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh deleted file mode 100644 index ed0bf9c..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_run_length_encode.cuh +++ /dev/null @@ -1,278 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_rle.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A run-length encoding - * computes a simple compressed representation of a sequence of input elements such that each - * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a - * count of the elements in that run. 
- * - * \par Usage Considerations - * \cdp_class{DeviceRunLengthEncode} - * - * \par Performance - * \linear_performance{run-length encode} - * - * \par - * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across - * different CUDA architectures for \p int32 items. - * Segments have lengths uniformly sampled from [1,1000]. - * - * \image html rle_int32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceRunLengthEncode -{ - - /** - * \brief Computes a run-length encoding of the sequence \p d_in. - * - * \par - * - For the ith run encountered, the first key of the run and its length are written to - * d_unique_out[i] and d_counts_out[i], - * respectively. - * - The total number of runs encountered is written to \p d_num_runs_out. - * - The == equality operator is used to determine whether values are equivalent - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated encode performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. - * - * \image html rle_int32_len_500.png - * \image html rle_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html rle_int32_len_5.png - * \image html rle_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the run-length encoding of a sequence of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_unique_out; // e.g., [ , , , , , , , ] - * int *d_counts_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_counts_out <-- [1, 2, 1, 3, 1] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator - * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - */ - template < - typename InputIteratorT, - typename UniqueOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsOutputIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Encode( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - typedef cub::Sum ReductionOp; // Value reduction operator - - // The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type - - // Generator type for providing 1s values for run-length reduction - typedef ConstantInputIterator LengthsInputIteratorT; - - return DispatchReduceByKey::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - LengthsInputIteratorT((LengthT) 1), - d_counts_out, - d_num_runs_out, - EqualityOp(), - ReductionOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. - * - * \par - * - For the ith non-trivial run, the run's starting offset - * and its length are written to d_offsets_out[i] and - * d_lengths_out[i], respectively. - * - The total number of runs encountered is written to \p d_num_runs_out. - * - The == equality operator is used to determine whether values are equivalent - * - \devicestorage - * - * \par Performance - * - * \par Snippet - * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_offsets_out; // e.g., [ , , , , , , , ] - * int *d_lengths_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // d_offsets_out <-- [1, 4] - * // d_lengths_out <-- [2, 3] - * // d_num_runs_out <-- [2] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator - * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - */ - template < - typename InputIteratorT, - typename OffsetsOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsOutputIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t NonTrivialRuns( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef Equality EqualityOp; // Default == operator - - return DeviceRleDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh deleted file mode 100644 index 4589279..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_scan.cuh +++ /dev/null @@ -1,443 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_scan.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) - * \ingroup SingleModule - * - * \par Overview - * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output sequence where each element is computed to be the reduction - * of the elements occurring earlier in the input sequence. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - * \par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm - * for performing global prefix scan with only a single pass through the - * input data, as described in our 2016 technical report [1]. The central - * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies - * of global prefix propagation with local computation. As such, our algorithm requires only - * ~2n data movement (n inputs are read, n outputs are written), and typically - * proceeds at "memcpy" speeds. - * - * \par - * [1] [Duane Merrill and Michael Garland. 
"Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) - * - * \par Usage Considerations - * \cdp_class{DeviceScan} - * - * \par Performance - * \linear_performance{prefix scan} - * - * \par - * The following chart illustrates DeviceScan::ExclusiveSum - * performance across different CUDA architectures for \p int32 keys. - * \plots_below - * - * \image html scan_int32.png - * - */ -struct DeviceScan -{ - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated exclusive sum performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html scan_int32.png - * \image html scan_int64.png - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveSum( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... 
then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - // Initial value - OutputT init_value = 0; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - init_value, - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op - * ... 
- * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix min-scan - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); - * - * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT, - typename InitValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - init_value, - num_items, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide inclusive prefix sum. - * - * \par - * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSum( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. 
Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScan( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - //@} end member group - -}; - -/** - * \example example_device_scan.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh deleted file mode 100644 index 7f8bf8e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_radix_sort.cuh +++ /dev/null @@ -1,875 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_radix_sort.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) - * \ingroup SegmentedModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending (or descending) order. The algorithm relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. 
- * - * \par - * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - * \par Usage Considerations - * \cdp_class{DeviceSegmentedRadixSort} - * - */ -struct DeviceSegmentedRadixSort -{ - - /******************************************************************//** - * \name Key-value pairs - *********************************************************************/ - //@{ - - /** - * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Keys-only - *********************************************************************/ - //@{ - - - /** - * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). 
- * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t 
SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - /** - * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh deleted file mode 100644 index 1964ec1..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_segmented_reduce.cuh +++ /dev/null @@ -1,619 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../iterator/arg_index_input_iterator.cuh" -#include "dispatch/dispatch_reduce.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) - * \ingroup SegmentedModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
- * - * \par Usage Considerations - * \cdp_class{DeviceSegmentedReduce} - * - */ -struct DeviceSegmentedReduce -{ - /** - * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT, - typename ReductionOp, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOp reduction_op, ///< [in] Binary reduction functor - T initial_value, ///< [in] Initial value of the reduction for each segment - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented sum using the addition ('+') operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction for each segment. 
- * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p + operators that are non-commutative.. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. 
- * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary 
storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
- * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh deleted file mode 100644 index 58bfe82..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_select.cuh +++ /dev/null @@ -1,369 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_select_if.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) - * \ingroup SingleModule - * - * \par Overview - * These operations apply a selection criterion to selectively copy - * items from a specified input sequence to a compact output sequence. 
- * - * \par Usage Considerations - * \cdp_class{DeviceSelect} - * - * \par Performance - * \linear_performance{select-flagged, select-if, and select-unique} - * - * \par - * The following chart illustrates DeviceSelect::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected. - * - * \image html select_if_int32_50_percent.png - * - * \par - * The following chart illustrates DeviceSelect::Unique - * performance across different CUDA architectures for \p int32 items - * where segments have lengths uniformly sampled from [1,1000]. - * - * \image html select_unique_int32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceSelect -{ - /** - * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename FlagIterator, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType SelectOp; // Selection op (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) - * - * \par - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-if performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Items are - * selected with 50% probability. 
- * - * \image html select_if_int32_50_percent.png - * \image html select_if_int64_50_percent.png - * - * \par - * The following charts are similar, but 5% selection probability: - * - * \image html select_if_int32_5_percent.png - * \image html select_if_int64_5_percent.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-unique performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. 
- * - * \image html select_unique_int32_len_500.png - * \image html select_unique_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html select_unique_int32_len_5.png - * \image html select_unique_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Unique( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_select_flagged.cu - * \example example_device_select_if.cu - * \example example_device_select_unique.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh deleted file mode 100644 index 8f3a4c5..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/device_spmv.cuh +++ /dev/null @@ -1,174 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). 
- */ - -#pragma once - -#include -#include -#include - -#include "dispatch/dispatch_spmv_orig.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). - * \ingroup SingleModule - * - * \par Overview - * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) - * performs the matrix-vector operation - * y = alpha*A*x + beta*y, - * where: - * - A is an mxn sparse matrix whose non-zero structure is specified in - * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) - * (i.e., three arrays: values, row_offsets, and column_indices) - * - x and y are dense vectors - * - alpha and beta are scalar multiplicands - * - * \par Usage Considerations - * \cdp_class{DeviceSpmv} - * - */ -struct DeviceSpmv -{ - /******************************************************************//** - * \name CSR matrix operations - *********************************************************************/ - //@{ - - /** - * \brief This function performs the matrix-vector operation y = A*x. - * - * \par Snippet - * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A - * representing a 3x3 lattice (24 non-zeros). 
- * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, - * // and output vector y - * int num_rows = 9; - * int num_cols = 9; - * int num_nonzeros = 24; - * - * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1] - * - * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - * // 4, 6, 1, 3, 5, 7, 2, 4, - * // 8, 3, 7, 4, 6, 8, 5, 7] - * - * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - * - * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - * float* d_vector_y; // e.g., [ , , , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros, alpha, beta); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run SpMV - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros, alpha, beta); - * - * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - * - * \endcode - * - * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) - */ - template < - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t CsrMV( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
- int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) - int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows, ///< [in] number of rows of matrix A. - int num_cols, ///< [in] number of columns of matrix A. - int num_nonzeros, ///< [in] number of nonzero elements of matrix A. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - SpmvParams spmv_params; - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = 1.0; - spmv_params.beta = 0.0; - - return DispatchSpmv::Dispatch( - d_temp_storage, - temp_storage_bytes, - spmv_params, - stream, - debug_synchronous); - } - - //@} end member group -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh deleted file mode 100644 index cdebd8b..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_histogram.cuh +++ /dev/null @@ -1,1096 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include -#include - -#include "../../agent/agent_histogram.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Histogram kernel entry points - *****************************************************************************/ - -/** - * Histogram initialization kernel entry point - */ -template < - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename OffsetT> ///< Signed integer type for global offsets -__global__ void DeviceHistogramInitKernel( - ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel - ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] - GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - if ((threadIdx.x == 0) && (blockIdx.x == 0)) - tile_queue.ResetDrain(); - - int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; - - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - if (output_bin < num_output_bins_wrapper.array[CHANNEL]) - d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; - } -} - - -/** - * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. 
- */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) - int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< The input iterator type. \iterator. - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) -__global__ void DeviceHistogramSweepKernel( - SampleIteratorT d_samples, ///< Input data to reduce - ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram - ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram - ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms - ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms - ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between 
starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Thread block type for compositing input tiles - typedef AgentHistogram< - AgentHistogramPolicyT, - PRIVATIZED_SMEM_BINS, - NUM_CHANNELS, - NUM_ACTIVE_CHANNELS, - SampleIteratorT, - CounterT, - PrivatizedDecodeOpT, - OutputDecodeOpT, - OffsetT> - AgentHistogramT; - - // Shared memory for AgentHistogram - __shared__ typename AgentHistogramT::TempStorage temp_storage; - - AgentHistogramT agent( - temp_storage, - d_samples, - num_output_bins_wrapper.array, - num_privatized_bins_wrapper.array, - d_output_histograms_wrapper.array, - d_privatized_histograms_wrapper.array, - output_decode_op_wrapper.array, - privatized_decode_op_wrapper.array); - - // Initialize counters - agent.InitBinCounters(); - - // Consume input tiles - agent.ConsumeTiles( - num_row_pixels, - num_rows, - row_stride_samples, - tiles_per_row, - tile_queue); - - // Store output to global (if necessary) - agent.StoreOutput(); - -} - - - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram - */ -template < - int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename LevelT, ///< Type for specifying bin level boundaries - typename OffsetT> ///< Signed integer type for global offsets -struct DipatchHistogram -{ - 
//--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - enum - { - // Maximum number of bins per channel for which we will use a privatized smem strategy - MAX_PRIVATIZED_SMEM_BINS = 256 - }; - - - //--------------------------------------------------------------------- - // Transform functors for converting samples to bin-ids - //--------------------------------------------------------------------- - - // Searches for bin given a list of bin-boundary levels - template - struct SearchTransform - { - LevelIteratorT d_levels; // Pointer to levels array - int num_output_levels; // Number of levels in array - - // Initializer - __host__ __device__ __forceinline__ void Init( - LevelIteratorT d_levels, // Pointer to levels array - int num_output_levels) // Number of levels in array - { - this->d_levels = d_levels; - this->num_output_levels = num_output_levels; - } - - // Method for converting samples to bin-ids - template - __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) - { - /// Level iterator wrapper type - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - LevelIteratorT>::Type // Directly use the supplied input iterator type - WrappedLevelIteratorT; - - WrappedLevelIteratorT wrapped_levels(d_levels); - - int num_bins = num_output_levels - 1; - if (valid) - { - bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; - if (bin >= num_bins) - bin = -1; - } - } - }; - - - // Scales samples to evenly-spaced bins - struct ScaleTransform - { - int num_bins; // Number of levels in array - LevelT max; // Max sample level (exclusive) - LevelT min; // Min sample level (inclusive) - LevelT scale; // Bin scaling factor - 
- // Initializer - template - __host__ __device__ __forceinline__ void Init( - int num_output_levels, // Number of levels in array - _LevelT max, // Max sample level (exclusive) - _LevelT min, // Min sample level (inclusive) - _LevelT scale) // Bin scaling factor - { - this->num_bins = num_output_levels - 1; - this->max = max; - this->min = min; - this->scale = scale; - } - - // Initializer (float specialization) - __host__ __device__ __forceinline__ void Init( - int num_output_levels, // Number of levels in array - float max, // Max sample level (exclusive) - float min, // Min sample level (inclusive) - float scale) // Bin scaling factor - { - this->num_bins = num_output_levels - 1; - this->max = max; - this->min = min; - this->scale = float(1.0) / scale; - } - - // Initializer (double specialization) - __host__ __device__ __forceinline__ void Init( - int num_output_levels, // Number of levels in array - double max, // Max sample level (exclusive) - double min, // Min sample level (inclusive) - double scale) // Bin scaling factor - { - this->num_bins = num_output_levels - 1; - this->max = max; - this->min = min; - this->scale = double(1.0) / scale; - } - - // Method for converting samples to bin-ids - template - __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) - { - LevelT level_sample = (LevelT) sample; - - if (valid && (level_sample >= min) && (level_sample < max)) - bin = (int) ((level_sample - min) / scale); - } - - // Method for converting samples to bin-ids (float specialization) - template - __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) - { - LevelT level_sample = (LevelT) sample; - - if (valid && (level_sample >= min) && (level_sample < max)) - bin = (int) ((level_sample - min) * scale); - } - - // Method for converting samples to bin-ids (double specialization) - template - __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) - { - LevelT 
level_sample = (LevelT) sample; - - if (valid && (level_sample >= min) && (level_sample < max)) - bin = (int) ((level_sample - min) * scale); - } - }; - - - // Pass-through bin transform operator - struct PassThruTransform - { - // Method for converting samples to bin-ids - template - __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) - { - if (valid) - bin = (int) sample; - } - }; - - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - - - /// SM11 - struct Policy110 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 512, - (NUM_CHANNELS == 1) ? 8 : 2, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - true, - GMEM, - false> - HistogramSweepPolicy; - }; - - /// SM20 - struct Policy200 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - (NUM_CHANNELS == 1) ? 256 : 128, - (NUM_CHANNELS == 1) ? 8 : 3, - (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - SMEM, - false> - HistogramSweepPolicy; - }; - - /// SM30 - struct Policy300 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 512, - (NUM_CHANNELS == 1) ? 
8 : 2, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - true, - GMEM, - false> - HistogramSweepPolicy; - }; - - /// SM35 - struct Policy350 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 128, - TScale<8>::VALUE, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLEND, - true> - HistogramSweepPolicy; - }; - - /// SM50 - struct Policy500 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 384, - TScale<16>::VALUE, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - SMEM, - false> - HistogramSweepPolicy; - }; - - - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 500) - typedef Policy500 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#else - typedef Policy110 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; - - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t InitConfigs( - int ptx_version, - KernelConfig &histogram_sweep_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - return histogram_sweep_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - return 
histogram_sweep_config.template Init(); - } - else if (ptx_version >= 350) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 300) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 200) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 110) - { - return histogram_sweep_config.template Init(); - } - else - { - // No global atomic support - return cudaErrorNotSupported; - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration - */ - struct KernelConfig - { - int block_threads; - int pixels_per_thread; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Init() - { - block_threads = BlockPolicy::BLOCK_THREADS; - pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; - - return cudaSuccess; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Privatization-based dispatch routine - */ - template < - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel - typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t PrivatizedDispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. 
- OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel - int max_num_output_bins, ///< [in] Maximum number of output bins in any channel - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel - DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel - KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - #ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - - #else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get SM occupancy for histogram_sweep_kernel - int histogram_sweep_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - histogram_sweep_sm_occupancy, - histogram_sweep_kernel, - histogram_sweep_config.block_threads))) break; - - // Get device occupancy for histogram_sweep_kernel - int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; - - if (num_row_pixels * NUM_CHANNELS == row_stride_samples) - { - // Treat as a single linear array of samples - num_row_pixels *= num_rows; - num_rows = 1; - row_stride_samples = num_row_pixels * NUM_CHANNELS; - } - - // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy - int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; - int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; - int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); - int blocks_per_col = (blocks_per_row > 0) ? 
- int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : - 0; - int num_thread_blocks = blocks_per_row * blocks_per_col; - - dim3 sweep_grid_dims; - sweep_grid_dims.x = (unsigned int) blocks_per_row; - sweep_grid_dims.y = (unsigned int) blocks_per_col; - sweep_grid_dims.z = 1; - - // Temporary storage allocation requirements - const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; - void* allocations[NUM_ALLOCATIONS]; - size_t allocation_sizes[NUM_ALLOCATIONS]; - - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); - - allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the grid queue descriptor - GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); - - // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_output_histograms_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; - - // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_privatized_histograms_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; - - // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) - ArrayWrapper privatized_decode_op_wrapper; - for (int 
CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; - - // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) - ArrayWrapper output_decode_op_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; - - // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) - ArrayWrapper num_privatized_bins_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; - - // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) - ArrayWrapper num_output_bins_wrapper; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; - - int histogram_init_block_threads = 256; - int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; - - // Log DeviceHistogramInitKernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", - histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); - - // Invoke histogram_init_kernel - histogram_init_kernel<<>>( - num_output_bins_wrapper, - d_output_histograms_wrapper, - tile_queue); - - // Return if empty problem - if ((blocks_per_row == 0) || (blocks_per_col == 0)) - break; - - // Log histogram_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", - sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, - histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, 
histogram_sweep_sm_occupancy); - - // Invoke histogram_sweep_kernel - histogram_sweep_kernel<<>>( - d_samples, - num_output_bins_wrapper, - num_privatized_bins_wrapper, - d_output_histograms_wrapper, - d_privatized_histograms_wrapper, - output_decode_op_wrapper, - privatized_decode_op_wrapper, - num_row_pixels, - num_rows, - row_stride_samples, - tiles_per_row, - tile_queue); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - } - while (0); - - return error; - - #endif // CUB_RUNTIME_ENABLED - } - - - - /** - * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit - */ - CUB_RUNTIME_FUNCTION - static cudaError_t DispatchRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the search transform op for converting samples to privatized bins - typedef SearchTransform PrivatizedDecodeOpT; - - // Use the pass-thru transform op for converting privatized bins to output bins - typedef PassThruTransform OutputDecodeOpT; - - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - // Dispatch - if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) - { - // Too many 
bins to keep in shared memory. - const int PRIVATIZED_SMEM_BINS = 0; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - else - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - - } while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) - */ - CUB_RUNTIME_FUNCTION - static cudaError_t DispatchRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. 
For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the pass-thru transform op for converting samples to privatized bins - typedef PassThruTransform PrivatizedDecodeOpT; - - // Use the search transform op for converting privatized bins to output bins - typedef SearchTransform OutputDecodeOpT; - - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; // Maximum number of levels in any channel - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - num_privatized_levels[channel] = 257; - output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - const int PRIVATIZED_SMEM_BINS = 256; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_privatized_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - - } while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t DispatchEven( - void* 
d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the scale transform op for converting samples to privatized bins - typedef ScaleTransform PrivatizedDecodeOpT; - - // Use the pass-thru transform op for converting privatized bins to output bins - typedef PassThruTransform OutputDecodeOpT; - - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - int bins = num_output_levels[channel] - 1; - LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; - - privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = 0; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - else - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = 
MAX_PRIVATIZED_SMEM_BINS; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - } - while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t DispatchEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 
- LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the pass-thru transform op for converting samples to privatized bins - typedef PassThruTransform PrivatizedDecodeOpT; - - // Use the scale transform op for converting privatized bins to output bins - typedef ScaleTransform OutputDecodeOpT; - - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - num_privatized_levels[channel] = 257; - - int bins = num_output_levels[channel] - 1; - LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; - output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], 
scale); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - const int PRIVATIZED_SMEM_BINS = 256; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_privatized_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - - } - while (0); - - return error; - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh deleted file mode 100644 index b4559ad..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_radix_sort.cuh +++ /dev/null @@ -1,1657 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "../../agent/agent_radix_sort_upsweep.cuh" -#include "../../agent/agent_radix_sort_downsweep.cuh" -#include "../../agent/agent_scan.cuh" -#include "../../block/block_radix_sort.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) -__global__ void DeviceRadixSortUpsweepKernel( - const KeyT *d_keys, ///< [in] Input keys buffer - OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- OffsetT /*num_items*/, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block -{ - enum { - TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD - }; - - // Parameterize AgentRadixSortUpsweep type for the current configuration - typedef AgentRadixSortUpsweep< - typename If<(ALT_DIGIT_BITS), - typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, - typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type, - KeyT, - OffsetT> - AgentRadixSortUpsweepT; - - // Shared memory storage - __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; - - // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block - even_share.template BlockInit(); - - AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); - - upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); - - CTA_SYNC(); - - // Write out digit counts (striped) - upsweep.ExtractCounts(d_spine, gridDim.x, blockIdx.x); -} - - -/** - * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortScanBinsKernel( - OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- int num_counts) ///< [in] Total number of bin-counts -{ - // Parameterize the AgentScan type for the current configuration - typedef AgentScan< - typename ChainedPolicyT::ActivePolicy::ScanPolicy, - OffsetT*, - OffsetT*, - cub::Sum, - OffsetT, - OffsetT> - AgentScanT; - - // Shared memory storage - __shared__ typename AgentScanT::TempStorage temp_storage; - - // Block scan instance - AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; - - // Process full input tiles - int block_offset = 0; - BlockScanRunningPrefixOp prefix_op(0, Sum()); - while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) - { - block_scan.template ConsumeTile(block_offset, prefix_op); - block_offset += AgentScanT::TILE_ITEMS; - } -} - - -/** - * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) -__global__ void DeviceRadixSortDownsweepKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block -{ - enum { - TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD - }; - - // Parameterize AgentRadixSortDownsweep type for the current configuration - typedef AgentRadixSortDownsweep< - typename If<(ALT_DIGIT_BITS), - typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, - typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, - IS_DESCENDING, - KeyT, - ValueT, - OffsetT> - AgentRadixSortDownsweepT; - - // Shared memory storage - __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.template BlockInit(); - - // Process input tiles - AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( - even_share.block_offset, - even_share.block_end); -} - - -/** - * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
- */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceRadixSortSingleTileKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison -{ - // Constants - enum - { - BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, - KEYS_ONLY = Equals::VALUE, - }; - - // BlockRadixSort type - typedef BlockRadixSort< - KeyT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ValueT, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, - (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), - ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> - BlockRadixSortT; - - // BlockLoad type (keys) - typedef BlockLoad< - KeyT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; - - // BlockLoad type (values) - typedef BlockLoad< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; - - // Unsigned word for key bits - typedef typename Traits::UnsignedBits UnsignedBitsT; - - // Shared memory storage - __shared__ union TempStorage - { - typename BlockRadixSortT::TempStorage sort; - 
typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadValues::TempStorage load_values; - - } temp_storage; - - // Keys and values for the block - KeyT keys[ITEMS_PER_THREAD]; - ValueT values[ITEMS_PER_THREAD]; - - // Get default (min/max) value for out-of-bounds keys - UnsignedBitsT default_key_bits = (IS_DESCENDING) ? Traits::LOWEST_KEY : Traits::MAX_KEY; - KeyT default_key = reinterpret_cast(default_key_bits); - - // Load keys - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - - CTA_SYNC(); - - // Load values - if (!KEYS_ONLY) - { - BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - - CTA_SYNC(); - } - - // Sort tile - BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( - keys, - values, - current_bit, - end_bit, - Int2Type(), - Int2Type()); - - // Store keys and values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; - if (item_offset < num_items) - { - d_keys_out[item_offset] = keys[ITEM]; - if (!KEYS_ONLY) - d_values_out[item_offset] = values[ITEM]; - } - } -} - - -/** - * Segmented radix sorting pass (one block per segment) - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? 
- ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedRadixSortKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - int current_bit, ///< [in] Bit position of current radix digit - int pass_bits) ///< [in] Number of bits of current radix digit -{ - // - // Constants - // - - typedef typename If<(ALT_DIGIT_BITS), - typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, - typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; - - enum - { - BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, - RADIX_BITS = SegmentedPolicyT::RADIX_BITS, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - RADIX_DIGITS = 1 << RADIX_BITS, - KEYS_ONLY = Equals::VALUE, - }; - - // Upsweep type - typedef AgentRadixSortUpsweep< - AgentRadixSortUpsweepPolicy, - KeyT, - OffsetT> - BlockUpsweepT; - - // Digit-scan type - typedef BlockScan DigitScanT; - - // Downsweep type - typedef AgentRadixSortDownsweep BlockDownsweepT; - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = 
BlockDownsweepT::BINS_TRACKED_PER_THREAD - }; - - // - // Process input tiles - // - - // Shared memory storage - __shared__ union - { - typename BlockUpsweepT::TempStorage upsweep; - typename BlockDownsweepT::TempStorage downsweep; - struct - { - volatile OffsetT reverse_counts_in[RADIX_DIGITS]; - volatile OffsetT reverse_counts_out[RADIX_DIGITS]; - typename DigitScanT::TempStorage scan; - }; - - } temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - OffsetT num_items = segment_end - segment_begin; - - // Check if empty segment - if (num_items <= 0) - return; - - // Upsweep - BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); - upsweep.ProcessRegion(segment_begin, segment_end); - - CTA_SYNC(); - - // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) - OffsetT bin_count[BINS_TRACKED_PER_THREAD]; - upsweep.ExtractCounts(bin_count); - - CTA_SYNC(); - - if (IS_DESCENDING) - { - // Reverse bin counts - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; - } - - CTA_SYNC(); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; - } - } - - // Scan - OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) - DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - 
bin_offset[track] += segment_begin; - } - - if (IS_DESCENDING) - { - // Reverse bin offsets - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; - } - - CTA_SYNC(); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; - } - } - - CTA_SYNC(); - - // Downsweep - BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); - downsweep.ProcessRegion(segment_begin, segment_end); -} - - - -/****************************************************************************** - * Policy - ******************************************************************************/ - -/** - * Tuning policy for kernel specialization - */ -template < - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -struct DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, - }; - - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM13 - struct 
Policy130 : ChainedPolicy<130, Policy130, Policy130> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type 
AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy130> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / 
SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 
CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - enum { - PRIMARY_RADIX_BITS = 6, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) - }; - - // Scan policy - typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef DownsweepPolicyKeys DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - 
typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - - - }; - - - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> - { - enum { - PRIMARY_RADIX_BITS = 7, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 3.1B 32b segmented keys/s (TitanX) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; - }; - - - /// SM60 (GP100) - struct Policy600 : ChainedPolicy<600, Policy600, Policy500> - { - enum { - PRIMARY_RADIX_BITS = 7, // 6.9B 32b keys/s (Quadro P100) - SINGLE_TILE_RADIX_BITS = 6, - 
SEGMENTED_RADIX_BITS = 6, // 5.9B 32b segmented keys/s (Quadro P100) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; - - }; - - - /// SM61 (GP104) - struct Policy610 : ChainedPolicy<610, Policy610, Policy600> - { - enum { - PRIMARY_RADIX_BITS = 7, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 3.3B 32b segmented keys/s (1080) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, 
RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; - - // Upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; - }; - - - /// SM62 (Tegra, less RF) - struct Policy620 : ChainedPolicy<620, Policy620, Policy610> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; - - // Upsweep 
policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM70 (GV100) - struct Policy700 : ChainedPolicy<700, Policy700, Policy620> - { - enum { - PRIMARY_RADIX_BITS = 6, // 7.62B 32b keys/s (GV100) - SINGLE_TILE_RADIX_BITS = 6, - SEGMENTED_RADIX_BITS = 6, // 8.7B 32b segmented keys/s (GV100) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; - - // Upsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> UpsweepPolicy; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / 
SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; - }; - - - /// MaxPolicy - typedef Policy700 MaxPolicy; - - -}; - - - -/****************************************************************************** - * Single-problem dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort - */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchRadixSort : - DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - }; - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchRadixSort( - void* d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - OffsetT num_items, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys(d_keys), - d_values(d_values), - num_items(num_items), - begin_bit(begin_bit), - end_bit(end_bit), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version), - is_overwrite_okay(is_overwrite_okay) - {} - - - 
//------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block to sort in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Return if empty problem - if (num_items == 0) - break; - - // Log single_tile_kernel configuration - if (debug_synchronous) - _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, - ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); - - // Invoke upsweep_kernel with same grid size as downsweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( - d_keys.Current(), - d_keys.Alternate(), - d_values.Current(), - d_values.Alternate(), - num_items, - begin_bit, - end_bit); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Update selector - d_keys.selector ^= 1; 
- d_values.selector ^= 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Normal problem size invocation - //------------------------------------------------------------------------------ - - /** - * Invoke a three-kernel sorting pass at the current bit. - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePass( - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - OffsetT *d_spine, - int spine_length, - int ¤t_bit, - PassConfigT &pass_config) - { - cudaError error = cudaSuccess; - do - { - int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); - - // Log upsweep_kernel configuration - if (debug_synchronous) - _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", - pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, - pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); - - /*printf("running upsweep kernel with pass_bits %d\n", pass_bits);*/ - /*printf("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",*/ - /*pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,*/ - /*pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);*/ - - // Invoke upsweep_kernel with same grid size as downsweep_kernel - pass_config.upsweep_kernel<<>>( - d_keys_in, - d_spine, - num_items, - current_bit, - pass_bits, - pass_config.even_share); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) 
break; - - // Log scan_kernel configuration - if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", - 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); - - // Invoke scan_kernel - pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( - d_spine, - spine_length); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log downsweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, - pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); - - // Invoke downsweep_kernel - pass_config.downsweep_kernel<<>>( - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - d_spine, - num_items, - current_bit, - pass_bits, - pass_config.even_share); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Update current bit - current_bit += pass_bits; - } - while (0); - - return error; - } - - - - /// Pass configuration structure - template < - typename UpsweepKernelT, - typename ScanKernelT, - typename DownsweepKernelT> - struct PassConfig - { - UpsweepKernelT upsweep_kernel; - KernelConfig upsweep_config; - ScanKernelT scan_kernel; - KernelConfig scan_config; - DownsweepKernelT downsweep_kernel; - KernelConfig downsweep_config; - int radix_bits; - int radix_digits; - int max_downsweep_grid_size; - GridEvenShare even_share; - - /// Initialize pass configuration - template < - 
typename UpsweepPolicyT, - typename ScanPolicyT, - typename DownsweepPolicyT> - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InitPassConfig( - UpsweepKernelT upsweep_kernel, - ScanKernelT scan_kernel, - DownsweepKernelT downsweep_kernel, - int ptx_version, - int sm_count, - int num_items) - { - cudaError error = cudaSuccess; - do - { - this->upsweep_kernel = upsweep_kernel; - this->scan_kernel = scan_kernel; - this->downsweep_kernel = downsweep_kernel; - radix_bits = DownsweepPolicyT::RADIX_BITS; - radix_digits = 1 << radix_bits; - - if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; - if (CubDebug(error = scan_config.Init(scan_kernel))) break; - if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; - - max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); - - even_share.DispatchInit( - num_items, - max_downsweep_grid_size, - CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); - - } - while (0); - return error; - } - - }; - - - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel - typename ScanKernelT, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - DownsweepKernelT 
alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)upsweep_kernel; - (void)alt_upsweep_kernel; - (void)scan_kernel; - (void)downsweep_kernel; - (void)alt_downsweep_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Init regular and alternate-digit kernel configurations - PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig< - typename ActivePolicyT::UpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::DownsweepPolicy>( - upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; - - if ((error = alt_pass_config.template InitPassConfig< - typename ActivePolicyT::AltUpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::AltDownsweepPolicy>( - alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; - - // Get maximum spine length - int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); - int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; - - // Temporary storage allocation requirements - void* allocations[3]; - size_t allocation_sizes[3] = - { - spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms - (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 
0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size - int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; - bool is_num_passes_odd = num_passes & 1; - int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); - - // Alias the temporary storage allocations - OffsetT *d_spine = static_cast(allocations[0]); - - DoubleBuffer d_keys_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), - (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); - - DoubleBuffer d_values_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), - (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); - - // Run first pass, consuming from the input's current buffers - int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; - - // Run remaining passes - while (current_bit < end_bit) - { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;; - - // Invert selectors - d_keys_remaining_passes.selector ^= 1; - d_values_remaining_passes.selector ^= 1; - } - - // Update selector - if (!is_overwrite_okay) { - num_passes = 1; // Sorted data always ends up in the other vector - } - - d_keys.selector = (d_keys.selector + num_passes) & 1; - d_values.selector = (d_values.selector + num_passes) & 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; - typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - - // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) - { - // Small, single tile size - return InvokeSingleTile( - DeviceRadixSortSingleTileKernel); - } - else - { - // Regular size - return InvokePasses( - DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, - DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, - RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, 
OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); - } - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items, ///< [in] Number of items to sort - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; - - cudaError_t error; - do { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchRadixSort dispatch( - d_temp_storage, temp_storage_bytes, - d_keys, d_values, - num_items, begin_bit, end_bit, is_overwrite_okay, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - - } while (0); - - return error; - } -}; - - - - -/****************************************************************************** - * Segmented dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort - */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchSegmentedRadixSort : - DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - }; - - - //------------------------------------------------------------------------------ - // Parameter members - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - - - //------------------------------------------------------------------------------ - // Constructors - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedRadixSort( - void* d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - OffsetT num_items, - OffsetT num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys(d_keys), - d_values(d_values), - num_items(num_items), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - begin_bit(begin_bit), - end_bit(end_bit), - is_overwrite_okay(is_overwrite_okay), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - //------------------------------------------------------------------------------ - // Multi-segment invocation - //------------------------------------------------------------------------------ - - /// Invoke a three-kernel sorting pass at the current bit. 
- template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePass( - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - int ¤t_bit, - PassConfigT &pass_config) - { - cudaError error = cudaSuccess; - do - { - int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); - - // Log kernel configuration - if (debug_synchronous) - _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", - num_segments, pass_config.segmented_config.block_threads, (long long) stream, - pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); - - pass_config.segmented_kernel<<>>( - d_keys_in, d_keys_out, - d_values_in, d_values_out, - d_begin_offsets, d_end_offsets, num_segments, - current_bit, pass_bits); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Update current bit - current_bit += pass_bits; - } - while (0); - - return error; - } - - - /// PassConfig data structure - template - struct PassConfig - { - SegmentedKernelT segmented_kernel; - KernelConfig segmented_config; - int radix_bits; - int radix_digits; - - /// Initialize pass configuration - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) - { - this->segmented_kernel = segmented_kernel; - this->radix_bits = SegmentedPolicyT::RADIX_BITS; - this->radix_digits = 1 << radix_bits; - - return CubDebug(segmented_config.Init(segmented_kernel)); - } - }; - - - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel - CUB_RUNTIME_FUNCTION 
__forceinline__ - cudaError_t InvokePasses( - SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel - SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_kernel; - (void)alt_segmented_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - - cudaError error = cudaSuccess; - do - { - // Init regular and alternate kernel configurations - PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; - if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - if (temp_storage_bytes == 0) - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size - int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; - int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; - int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + radix_bits - 1) / radix_bits; - bool is_num_passes_odd = num_passes & 1; - int max_alt_passes = (num_passes * radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); - - DoubleBuffer d_keys_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), - (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); - - DoubleBuffer d_values_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), - (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); - - // Run first pass, consuming from the input's current buffers - int current_bit = begin_bit; - - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; - - // Run remaining passes - while (current_bit < end_bit) - { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; - - // Invert selectors and update current bit - d_keys_remaining_passes.selector ^= 1; - d_values_remaining_passes.selector ^= 1; - } - - // Update selector - if (!is_overwrite_okay) { - num_passes = 1; // Sorted data always ends up in the other vector - } - - d_keys.selector = (d_keys.selector + num_passes) & 1; - d_values.selector = (d_values.selector + num_passes) & 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedRadixSortKernel, - DeviceSegmentedRadixSortKernel); - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - - /// Internal dispatch routine - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; - - cudaError_t error; - do { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedRadixSort dispatch( - d_temp_storage, temp_storage_bytes, - d_keys, d_values, - num_items, num_segments, d_begin_offsets, d_end_offsets, - begin_bit, end_bit, is_overwrite_okay, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - - } while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh deleted file mode 100644 index b6aa44c..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce.cuh +++ /dev/null @@ -1,882 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../../agent/agent_reduce.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Reduce region kernel entry point (multi-block). 
Computes privatized reductions, one per thread block. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - ReductionOpT reduction_op) ///< [in] Binary reduction functor -{ - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - d_out[blockIdx.x] = block_aggregate; -} - - -/** - * Reduce a single tile kernel entry point (single-block). 
Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceReduceSingleTileKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OuputT init) ///< [in] The initial value of the reduction -{ - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) - { - if (threadIdx.x == 0) - *d_out = init; - return; - } - - // Consume input tiles - OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - OffsetT(0), - num_items); - - // Output result - if (threadIdx.x == 0) - *d_out = reduction_op(init, block_aggregate); -} - - -/// Normalize input iterator to segment offset -template -__device__ __forceinline__ -void NormalizeReductionOutput( - T &/*val*/, - OffsetT /*base_offset*/, - IteratorT /*itr*/) -{} - - -/// Normalize input 
iterator to segment offset (specialized for arg-index) -template -__device__ __forceinline__ -void NormalizeReductionOutput( - KeyValuePairT &val, - OffsetT base_offset, - ArgIndexInputIterator /*itr*/) -{ - val.key -= base_offset; -} - - -/** - * Segmented reduction (one block per segment) - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction -{ - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - - // Check if empty problem - if (segment_begin == segment_end) - { - if (threadIdx.x == 0) - d_out[blockIdx.x] = init; - return; - } - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - segment_begin, - segment_end); - - // Normalize as needed - NormalizeReductionOutput(block_aggregate, segment_begin, d_in); - - if (threadIdx.x == 0) - d_out[blockIdx.x] = reduction_op(init, block_aggregate);; -} - - - - -/****************************************************************************** - * Policy - ******************************************************************************/ - -template < - typename OuputT, ///< Data type - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DeviceReducePolicy -{ - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM13 - struct Policy130 : ChainedPolicy<130, Policy130, Policy130> - { - // ReducePolicy - typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - 
BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy130> - { - // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> - { - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 20, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // 
SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< - CUB_NOMINAL_CONFIG(256, 16, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// MaxPolicy - typedef Policy600 MaxPolicy; - -}; - - - -/****************************************************************************** - * Single-problem dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DispatchReduce : - DeviceReducePolicy< - typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type, // ... 
else the output iterator's value type - OffsetT, - ReductionOpT> -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - // Data type of output iterator - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_items(num_items), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - //------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block block to reduce in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", - 
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - - // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( - d_in, - d_out, - num_items, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Normal problem size invocation (two-pass) - //------------------------------------------------------------------------------ - - /// Invoke two-passes to reduce - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void) reduce_kernel; - (void) single_tile_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Init regular kernel configuration - KernelConfig reduce_config; - if (CubDebug(error = 
reduce_config.Init(reduce_kernel))) break; - int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; - - // Even-share work distribution - int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); - GridEvenShare even_share; - even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); - - // Temporary storage allocation requirements - void* allocations[1]; - size_t allocation_sizes[1] = - { - max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Alias the allocation for the privatized per-block reductions - OutputT *d_block_reductions = (OutputT*) allocations[0]; - - // Get grid size for device_reduce_sweep_kernel - int reduce_grid_size = even_share.grid_size; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - reduce_grid_size, - ActivePolicyT::ReducePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, - reduce_config.sm_occupancy); - - // Invoke DeviceReduceKernel - reduce_kernel<<>>( - d_in, - d_block_reductions, - num_items, - even_share, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", - 
ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - - // Invoke DeviceReduceSingleTileKernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( - d_block_reductions, - d_out, - reduce_grid_size, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) - { - // Small, single tile size - return InvokeSingleTile( - DeviceReduceSingleTileKernel); - } - else - { - // Regular size - return InvokePasses( - DeviceReduceKernel, - DeviceReduceSingleTileKernel); - } - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } -}; - - - -/****************************************************************************** - * Segmented dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - 
typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DispatchSegmentedReduce : - DeviceReducePolicy< - typename std::iterator_traits::value_type, - OffsetT, - ReductionOpT> -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - /// The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of 
cub::DeviceSegmentedReduceKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_reduce_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Init kernel configuration - KernelConfig segmented_reduce_config; - if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - num_segments, - ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, - segmented_reduce_config.sm_occupancy); - - // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( - d_in, - d_out, - d_begin_offsets, - d_end_offsets, - num_segments, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - - } - - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedReduceKernel); - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION 
__forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - if (num_segments <= 0) - return cudaSuccess; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, - num_segments, d_begin_offsets, d_end_offsets, - reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh deleted file mode 100644 index 672bc49..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ /dev/null @@ -1,554 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_reduce_by_key.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Multi-block reduce-by-key sweep kernel entry point - */ -template < - typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) -__global__ void DeviceReduceByKeyKernel( - KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs 
encountered (i.e., the length of d_unique_out) - ScanTileStateT tile_state, ///< Tile status interface - int start_tile, ///< The starting tile for the current grid - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op, ///< ValueT reduction operator - OffsetT num_items) ///< Total number of items to select from -{ - // Thread block type for reducing tiles of value segments - typedef AgentReduceByKey< - AgentReduceByKeyPolicyT, - KeysInputIteratorT, - UniqueOutputIteratorT, - ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - EqualityOpT, - ReductionOpT, - OffsetT> - AgentReduceByKeyT; - - // Shared memory for AgentReduceByKey - __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; - - // Process tiles - AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( - num_items, - tile_state, - start_tile); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey - */ -template < - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchReduceByKey -{ - //------------------------------------------------------------------------- 
- // Types and constants - //------------------------------------------------------------------------- - - // The input keys type - typedef typename std::iterator_traits::value_type KeyInputT; - - // The output keys type - typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type - - // The input values type - typedef typename std::iterator_traits::value_type ValueInputT; - - // The output values type - typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type - - enum - { - INIT_KERNEL_THREADS = 128, - MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), - COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), - }; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - - //------------------------------------------------------------------------- - // Tuning policies - //------------------------------------------------------------------------- - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 6, - ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 6, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM11 - struct Policy110 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING> - ReduceByKeyPolicyT; - }; - - - /****************************************************************************** - * Tuning policies of current PTX 
compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy110 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &reduce_by_key_config) - { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); - } - else - { - reduce_by_key_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. 
- */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduce-by-key using the - * specified kernel functions. - */ - template < - typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel - typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - EqualityOpT equality_op, ///< [in] KeyT equality operator - ReductionOpT reduction_op, ///< [in] ValueT reduction operator - OffsetT num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel - KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_unique_out; - (void)d_values_in; - (void)d_aggregates_out; - (void)d_num_runs_out; - (void)equality_op; - (void)reduction_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)reduce_by_key_kernel; - (void)reduce_by_key_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if 
(CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( - tile_state, - num_tiles, - d_num_runs_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for reduce_by_key_kernel - int reduce_by_key_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - reduce_by_key_sm_occupancy, // out - reduce_by_key_kernel, - reduce_by_key_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Run grids in epochs (in case number of tiles exceeds max x-dimension - int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); - for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) - { - // Log reduce_by_key_kernel configuration - if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, 
reduce_by_key_sm_occupancy); - - // Invoke reduce_by_key_kernel - reduce_by_key_kernel<<>>( - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - tile_state, - start_tile, - equality_op, - reduction_op, - num_items); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - EqualityOpT equality_op, ///< [in] KeyT equality operator - ReductionOpT reduction_op, ///< [in] ValueT reduction operator - OffsetT num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig reduce_by_key_config; - InitConfigs(ptx_version, reduce_by_key_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - equality_op, - reduction_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceReduceByKeyKernel, - reduce_by_key_config))) break; - } - while (0); - - return error; - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh deleted file mode 100644 index 1de979e..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_rle.cuh +++ /dev/null @@ -1,538 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_rle.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Select kernel entry point (multi-block) - * - * Performs functor-based selection if SelectOp functor type != NullType - * Otherwise performs flag-based selection if FlagIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator - typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator - typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) -__global__ void DeviceRleSweepKernel( - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of 
\p d_offsets_out) - ScanTileStateT tile_status, ///< [in] Tile status interface - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - int num_tiles) ///< [in] Total number of tiles for the entire problem -{ - // Thread block type for selecting data from input tiles - typedef AgentRle< - AgentRlePolicyT, - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - EqualityOpT, - OffsetT> AgentRleT; - - // Shared memory for AgentRle - __shared__ typename AgentRleT::TempStorage temp_storage; - - // Process tiles - AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( - num_tiles, - tile_status, - d_num_runs_out); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceRle - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator - typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator - typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -struct DeviceRleDispatch -{ - /****************************************************************************** - * Types and constants - ******************************************************************************/ - - // The input value type - typedef typename std::iterator_traits::value_type T; - - // The lengths output value type - typedef typename 
If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... else the output iterator's value type - - enum - { - INIT_KERNEL_THREADS = 128, - }; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 96, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS> - RleSweepPolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - RleSweepPolicy; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - RleSweepPolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - 
RleSweepPolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - RleSweepPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig& device_rle_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 300) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 200) - { - 
device_rle_config.template Init(); - } - else if (ptx_version >= 130) - { - device_rle_config.template Init(); - } - else - { - device_rle_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - bool store_warp_time_slicing; - BlockScanAlgorithm scan_algorithm; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = AgentRlePolicyT::BLOCK_THREADS; - items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; - load_policy = AgentRlePolicyT::LOAD_ALGORITHM; - store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; - scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - store_warp_time_slicing, - scan_algorithm); - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide run-length-encode using the - * specified kernel functions. - */ - template < - typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel - typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version, ///< [in] PTX version of dispatch kernels - DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel - KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; - int 
num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log device_scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors - device_scan_init_kernel<<>>( - tile_status, - num_tiles, - d_num_runs_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for device_rle_sweep_kernel - int device_rle_kernel_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - device_rle_kernel_sm_occupancy, // out - device_rle_sweep_kernel, - device_rle_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Get grid size 
for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - - // Log device_rle_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); - - // Invoke device_rle_sweep_kernel - device_rle_sweep_kernel<<>>( - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - tile_status, - equality_op, - num_items, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig device_rle_config; - InitConfigs(ptx_version, device_rle_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceRleSweepKernel, - device_rle_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh deleted file mode 100644 index 8944dcd..0000000 --- a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_scan.cuh +++ /dev/null @@ -1,563 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../../agent/agent_scan.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_arch.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename ScanTileStateT> ///< Tile status interface type -__global__ void DeviceScanInitKernel( - ScanTileStateT tile_state, ///< [in] Tile status interface - int num_tiles) ///< [in] Number of tiles -{ - // Initialize tile status - tile_state.InitializeStatus(num_tiles); -} - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename ScanTileStateT, ///< Tile status interface type - typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected -__global__ void DeviceCompactInitKernel( - ScanTileStateT tile_state, ///< [in] Tile status interface - int num_tiles, ///< [in] Number of tiles - NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) -{ - // Initialize tile status - tile_state.InitializeStatus(num_tiles); - - // Initialize d_num_selected_out - if ((blockIdx.x == 0) && (threadIdx.x == 0)) - *d_num_selected_out = 0; -} - - -/** - * Scan kernel entry point (multi-block) - 
*/ -template < - typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator - typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator - typename ScanTileStateT, ///< Tile status interface type - typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) - typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) -__global__ void DeviceScanKernel( - InputIteratorT d_in, ///< Input data - OutputIteratorT d_out, ///< Output data - ScanTileStateT tile_state, ///< Tile status interface - int start_tile, ///< The starting tile for the current grid - ScanOpT scan_op, ///< Binary scan functor - InitValueT init_value, ///< Initial value to seed the exclusive scan - OffsetT num_items) ///< Total number of scan items for the entire problem -{ - // Thread block type for scanning input tiles - typedef AgentScan< - ScanPolicyT, - InputIteratorT, - OutputIteratorT, - ScanOpT, - InitValueT, - OffsetT> AgentScanT; - - // Shared memory for AgentScan - __shared__ typename AgentScanT::TempStorage temp_storage; - - // Process tiles - AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( - num_items, - tile_state, - start_tile); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceScan - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator - typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs 
\iterator - typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) - typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchScan -{ - //--------------------------------------------------------------------- - // Constants and Types - //--------------------------------------------------------------------- - - enum - { - INIT_KERNEL_THREADS = 128 - }; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - /// SM600 - struct Policy600 - { - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 15, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - /// SM520 - struct Policy520 - { - // Titan X: 32.47B items/s @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - /// SM35 - struct Policy350 - { - // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - BLOCK_SCAN_RAKING> - ScanPolicyT; - }; - - /// 
SM30 - struct Policy300 - { - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(256, 9, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - /// SM20 - struct Policy200 - { - // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - /// SM13 - struct Policy130 - { - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(96, 21, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_RAKING_MEMOIZE> - ScanPolicyT; - }; - - /// SM10 - struct Policy100 - { - typedef AgentScanPolicy< - CUB_NOMINAL_CONFIG(64, 9, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 600) - typedef Policy600 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 520) - typedef Policy520 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; - - - //--------------------------------------------------------------------- - // Utilities - 
//--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &scan_kernel_config) - { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - scan_kernel_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 520) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 350) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 300) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 200) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 130) - { - scan_kernel_config.template Init(); - } - else - { - scan_kernel_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide prefix scan using the - * specified kernel functions. 
- */ - template < - typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel - typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel - KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)scan_op; - (void)init_value; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)scan_kernel; - (void)scan_kernel_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Return if empty problem - if (num_items == 0) - break; - - // Construct the tile status interface - 
ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( - tile_state, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get SM occupancy for scan_kernel - int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - scan_sm_occupancy, // out - scan_kernel, - scan_kernel_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Run grids in epochs (in case number of tiles exceeds max x-dimension - int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); - for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) - { - // Log scan_kernel configuration - if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); - - // Invoke scan_kernel - scan_kernel<<>>( - d_in, - d_out, - tile_state, - start_tile, - scan_op, - init_value, - num_items); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // 
CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Get kernel kernel dispatch configurations - KernelConfig scan_kernel_config; - InitConfigs(ptx_version, scan_kernel_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - init_value, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceScanInitKernel, - DeviceScanKernel, - scan_kernel_config))) break; - } - while (0); - - return error; - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh deleted file mode 100644 index 6f03319..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_select_if.cuh +++ /dev/null @@ -1,542 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_select_if.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Select kernel entry point (multi-block) - * - * Performs functor-based selection if SelectOpT functor type != NullType - * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading input items - typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename ScanTileStateT, ///< Tile status interface type - typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) -__global__ void 
DeviceSelectSweepKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - ScanTileStateT tile_status, ///< [in] Tile status interface - SelectOpT select_op, ///< [in] Selection operator - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - int num_tiles) ///< [in] Total number of tiles for the entire problem -{ - // Thread block type for selecting data from input tiles - typedef AgentSelectIf< - AgentSelectIfPolicyT, - InputIteratorT, - FlagsInputIteratorT, - SelectedOutputIteratorT, - SelectOpT, - EqualityOpT, - OffsetT, - KEEP_REJECTS> AgentSelectIfT; - - // Shared memory for AgentSelectIf - __shared__ typename AgentSelectIfT::TempStorage temp_storage; - - // Process tiles - AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( - num_tiles, - tile_status, - d_num_selected_out); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items - typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items - typename 
NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -struct DispatchSelectIf -{ - /****************************************************************************** - * Types and constants - ******************************************************************************/ - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type - - // The flag value type - typedef typename std::iterator_traits::value_type FlagT; - - enum - { - INIT_KERNEL_THREADS = 128, - }; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 10, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 
7 : 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING_MEMOIZE> - SelectIfPolicyT; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING> - SelectIfPolicyT; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - 
CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &select_if_config) - { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - select_if_config.template Init(); - } - else if (ptx_version >= 300) - { - select_if_config.template Init(); - } - else if (ptx_version >= 200) - { - select_if_config.template Init(); - } - else if (ptx_version >= 130) - { - select_if_config.template Init(); - } - else - { - select_if_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide selection using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel - typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - SelectOpT select_op, ///< [in] Selection operator - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel - KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_flags; - (void)d_selected_out; - (void)d_num_selected_out; - (void)select_op; - (void)equality_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)scan_init_kernel; - (void)select_if_kernel; - (void)select_if_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile 
status interface - ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke scan_init_kernel to initialize tile descriptors - scan_init_kernel<<>>( - tile_status, - num_tiles, - d_num_selected_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for select_if_kernel - int range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - range_select_sm_occupancy, // out - select_if_kernel, - select_if_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - - // Log select_if_kernel configuration - if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); - - // Invoke select_if_kernel - select_if_kernel<<>>( - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - tile_status, - select_op, - equality_op, - num_items, - num_tiles); - - // Check for failure to 
launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - SelectOpT select_op, ///< [in] Selection operator - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig select_if_config; - InitConfigs(ptx_version, select_if_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - select_op, - equality_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceSelectSweepKernel, - select_if_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh deleted file mode 100644 index 267bbe2..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/device/dispatch/dispatch_spmv_orig.cuh +++ /dev/null @@ -1,850 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). - */ - -#pragma once - -#include -#include - -#include "../../agent/single_pass_scan_operators.cuh" -#include "../../agent/agent_segment_fixup.cuh" -#include "../../agent/agent_spmv_orig.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * SpMV kernel entry points - *****************************************************************************/ - -/** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
- */ -template < - typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets -__global__ void DeviceSpmv1ColKernel( - SpmvParams spmv_params) ///< [in] SpMV input parameter bundle -{ - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - VectorValueIteratorT; - - VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); - - int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (row_idx < spmv_params.num_rows) - { - OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; - OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; - - ValueT value = 0.0; - if (end_nonzero_idx != nonzero_idx) - { - value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; - } - - spmv_params.d_vector_y[row_idx] = value; - } -} - - -/** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. 
- */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - typename SpmvParamsT> ///< SpmvParams type -__global__ void DeviceSpmvSearchKernel( - int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) - CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates - SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle -{ - /// Constants - enum - { - BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - typedef CacheModifiedInputIterator< - SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, - OffsetT, - OffsetT> - RowOffsetsSearchIteratorT; - - // Find the starting coordinate for all tiles (plus the end coordinate of the last one) - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_merge_tiles + 1) - { - OffsetT diagonal = (tile_idx * TILE_ITEMS); - CoordinateT tile_coordinate; - CountingInputIterator nonzero_indices(0); - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coordinate); - - // Output starting offset - d_tile_coordinates[tile_idx] = tile_coordinate; - } -} - - -/** - * Spmv agent entry point - */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ScanTileStateT, ///< Tile status interface type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 - bool HAS_BETA> ///< Whether the input parameter Beta is 0 -__launch_bounds__ 
(int(SpmvPolicyT::BLOCK_THREADS)) -__global__ void DeviceSpmvKernel( - SpmvParams spmv_params, ///< [in] SpMV input parameter bundle - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_tiles, ///< [in] Number of merge tiles - ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel - int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) -{ - // Spmv agent type specialization - typedef AgentSpmv< - SpmvPolicyT, - ValueT, - OffsetT, - HAS_ALPHA, - HAS_BETA> - AgentSpmvT; - - // Shared memory for AgentSpmv - __shared__ typename AgentSpmvT::TempStorage temp_storage; - - AgentSpmvT(temp_storage, spmv_params).ConsumeTile( - d_tile_coordinates, - d_tile_carry_pairs, - num_tiles); - - // Initialize fixup tile status - tile_state.InitializeStatus(num_segment_fixup_tiles); - -} - - -/** - * Multi-block reduce-by-key sweep kernel entry point - */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename OffsetT, ///< Signed integer type for global offsets - typename ScanTileStateT> ///< Tile status interface type -__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) -__global__ void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block - AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates - OffsetT num_items, ///< [in] Total number of items to select from - int num_tiles, ///< [in] Total number of tiles for the entire problem - ScanTileStateT tile_state) ///< [in] Tile 
status interface -{ - // Thread block type for reducing tiles of value segments - typedef AgentSegmentFixup< - AgentSegmentFixupPolicyT, - PairsInputIteratorT, - AggregatesOutputIteratorT, - cub::Equality, - cub::Sum, - OffsetT> - AgentSegmentFixupT; - - // Shared memory for AgentSegmentFixup - __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; - - // Process tiles - AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( - num_items, - num_tiles, - tile_state); -} - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv - */ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchSpmv -{ - //--------------------------------------------------------------------- - // Constants and Types - //--------------------------------------------------------------------- - - enum - { - INIT_KERNEL_THREADS = 128 - }; - - // SpmvParams bundle type - typedef SpmvParams SpmvParamsT; - - // 2D merge path coordinate type - typedef typename CubVector::Type CoordinateT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair KeyValuePairT; - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - /// SM11 - struct Policy110 - { - typedef AgentSpmvPolicy< - 128, - 1, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - 
LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - /// SM20 - struct Policy200 - { - typedef AgentSpmvPolicy< - 96, - 18, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_RAKING> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - - }; - - - - /// SM30 - struct Policy300 - { - typedef AgentSpmvPolicy< - 96, - 6, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - - }; - - - /// SM35 - struct Policy350 - { - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 96 : 128, - (sizeof(ValueT) > 4) ? 4 : 7, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - - /// SM37 - struct Policy370 - { - - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 128 : 128, - (sizeof(ValueT) > 4) ? 9 : 14, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - /// SM50 - struct Policy500 - { - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 6 : 7, - LOAD_LDG, - LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - (sizeof(ValueT) > 4) ? 
BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> - SpmvPolicyT; - - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_RAKING_MEMOIZE> - SegmentFixupPolicyT; - }; - - - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 500) - typedef Policy500 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 370) - typedef Policy370 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#else - typedef Policy110 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; - struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; - - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &spmv_config, - KernelConfig &segment_fixup_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template 
Init(); - } - else if (ptx_version >= 350) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 300) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - - } - else if (ptx_version >= 200) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduction using the - * specified kernel functions. - * - * If the input is larger than a single tile, this method uses two-passes of - * kernel invocations. - */ - template < - typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel - typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel - typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel - typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel - SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel - SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel - SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for - KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - cudaError error = cudaSuccess; - do - { - if (spmv_params.num_cols == 1) - { - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - // Get search/init grid dims - int degen_col_kernel_block_size = INIT_KERNEL_THREADS; - int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; - - if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", - degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); - - // Invoke spmv_search_kernel - spmv_1col_kernel<<>>( - 
spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - break; - } - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Total number of spmv work items - int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; - - // Tile sizes of kernels - int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; - int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; - - // Number of tiles for kernels - unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; - unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; - - // Get SM occupancy for kernels - int spmv_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - spmv_sm_occupancy, - spmv_kernel, - spmv_config.block_threads))) break; - - int segment_fixup_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - segment_fixup_sm_occupancy, - segment_fixup_kernel, - segment_fixup_config.block_threads))) break; - - // Get grid dimensions - dim3 spmv_grid_size( - CUB_MIN(num_merge_tiles, max_dim_x), - (num_merge_tiles + max_dim_x - 1) / max_dim_x, - 1); - - dim3 segment_fixup_grid_size( - CUB_MIN(num_segment_fixup_tiles, max_dim_x), - (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, - 1); - - // Get the temporary storage allocation requirements - size_t allocation_sizes[3]; - if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors - allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs - allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - void* allocations[3]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; - - // Alias the other allocations - KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs - CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates - - // Get search/init grid dims - int search_block_size = INIT_KERNEL_THREADS; - int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; - -#if (CUB_PTX_ARCH == 0) - // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; -#endif - - if (search_grid_size < sm_count) -// if (num_merge_tiles < spmv_sm_occupancy * sm_count) - { - // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords - d_tile_coordinates = NULL; - } - else - { - // Use separate search kernel if we have enough spmv tiles to saturate the device - - // Log spmv_search_kernel configuration - if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", - search_grid_size, search_block_size, (long long) stream); - - // 
Invoke spmv_search_kernel - spmv_search_kernel<<>>( - num_merge_tiles, - d_tile_coordinates, - spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - - // Log spmv_kernel configuration - if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); - - // Invoke spmv_kernel - spmv_kernel<<>>( - spmv_params, - d_tile_coordinates, - d_tile_carry_pairs, - num_merge_tiles, - tile_state, - num_segment_fixup_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Run reduce-by-key fixup if necessary - if (num_merge_tiles > 1) - { - // Log segment_fixup_kernel configuration - if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); - - // Invoke segment_fixup_kernel - segment_fixup_kernel<<>>( - d_tile_carry_pairs, - spmv_params.d_vector_y, - num_merge_tiles, - num_segment_fixup_tiles, - tile_state); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - -#if (CUB_PTX_ARCH == 0) - // Free textures - if (CubDebug(error = 
spmv_params.t_vector_x.UnbindTexture())) break; -#endif - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig spmv_config, segment_fixup_config; - InitConfigs(ptx_version, spmv_config, segment_fixup_config); - - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - -/* - // Dispatch - if (spmv_params.beta == 0.0) - { - if (spmv_params.alpha == 1.0) - { - // Dispatch y = A*x - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - else - { - // Dispatch y = alpha*A*x - if (CubDebug(error = 
Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - } - else - { - if (spmv_params.alpha == 1.0) - { - // Dispatch y = A*x + beta*y - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - else - { - // Dispatch y = alpha*A*x + beta*y - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - } - } -*/ - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh deleted file mode 100644 index d9f8336..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_barrier.cuh +++ /dev/null @@ -1,211 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ - -#pragma once - -#include "../util_debug.cuh" -#include "../util_namespace.cuh" -#include "../thread/thread_load.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ -class GridBarrier -{ -protected : - - typedef unsigned int SyncFlag; - - // Counters in global device memory - SyncFlag* d_sync; - -public: - - /** - * Constructor - */ - GridBarrier() : d_sync(NULL) {} - - - /** - * Synchronize - */ - __device__ __forceinline__ void Sync() const - { - volatile SyncFlag *d_vol_sync = d_sync; - - // Threadfence and syncthreads to make sure global writes are visible before - // thread-0 reports in with its sync counter - __threadfence(); - CTA_SYNC(); - - if (blockIdx.x == 0) - { - // Report in ourselves - if (threadIdx.x == 0) - { - d_vol_sync[blockIdx.x] = 1; - } - - CTA_SYNC(); - - // Wait for everyone else to report in - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - while (ThreadLoad(d_sync + peer_block) == 0) - { - __threadfence_block(); - } - } - - CTA_SYNC(); - - // Let everyone know it's safe to proceed - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - d_vol_sync[peer_block] = 0; - } - } - else - { - if (threadIdx.x == 0) - { - // Report in - d_vol_sync[blockIdx.x] = 1; - - // Wait for acknowledgment - while (ThreadLoad(d_sync + blockIdx.x) == 1) - { - __threadfence_block(); - } - } - - CTA_SYNC(); - } - } -}; - - -/** - * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 
- * - * Uses RAII for lifetime, i.e., device resources are reclaimed when - * the destructor is called. - */ -class GridBarrierLifetime : public GridBarrier -{ -protected: - - // Number of bytes backed by d_sync - size_t sync_bytes; - -public: - - /** - * Constructor - */ - GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} - - - /** - * DeviceFrees and resets the progress counters - */ - cudaError_t HostReset() - { - cudaError_t retval = cudaSuccess; - if (d_sync) - { - CubDebug(retval = cudaFree(d_sync)); - d_sync = NULL; - } - sync_bytes = 0; - return retval; - } - - - /** - * Destructor - */ - virtual ~GridBarrierLifetime() - { - HostReset(); - } - - - /** - * Sets up the progress counters for the next kernel launch (lazily - * allocating and initializing them if necessary) - */ - cudaError_t Setup(int sweep_grid_size) - { - cudaError_t retval = cudaSuccess; - do { - size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); - if (new_sync_bytes > sync_bytes) - { - if (d_sync) - { - if (CubDebug(retval = cudaFree(d_sync))) break; - } - - sync_bytes = new_sync_bytes; - - // Allocate and initialize to zero - if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; - if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; - } - } while (0); - - return retval; - } -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh deleted file mode 100644 index 3ba29da..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_even_share.cuh +++ /dev/null @@ -1,222 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). 
- */ - - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" -#include "grid_mapping.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridEvenShare is a descriptor utility for distributing input among - * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly - * the same number of input tiles. - * - * \par Overview - * Each thread block is assigned a consecutive sequence of input tiles. To help - * preserve alignment and eliminate the overhead of guarded loads for all but the - * last thread block, to GridEvenShare assigns one of three different amounts of - * work to a given thread block: "big", "normal", or "last". The "big" workloads - * are one scheduling grain larger than "normal". The "last" work unit for the - * last thread block may be partially-full if the input is not an even multiple of - * the scheduling grain size. - * - * \par - * Before invoking a child grid, a parent thread will typically construct an - * instance of GridEvenShare. The instance can be passed to child thread blocks - * which can initialize their per-thread block offsets using \p BlockInit(). - */ -template -struct GridEvenShare -{ -private: - - OffsetT total_tiles; - int big_shares; - OffsetT big_share_items; - OffsetT normal_share_items; - OffsetT normal_base_offset; - -public: - - /// Total number of input items - OffsetT num_items; - - /// Grid size in thread blocks - int grid_size; - - /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles - OffsetT block_offset; - - /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles - OffsetT block_end; - - /// Stride between input tiles - OffsetT block_stride; - - - /** - * \brief Constructor. 
- */ - __host__ __device__ __forceinline__ GridEvenShare() : - total_tiles(0), - big_shares(0), - big_share_items(0), - normal_share_items(0), - normal_base_offset(0), - num_items(0), - grid_size(0), - block_offset(0), - block_end(0), - block_stride(0) - {} - - - /** - * \brief Dispatch initializer. To be called prior prior to kernel launch. - */ - __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items, ///< Total number of input items - int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) - int tile_items) ///< Number of data items per input tile - { - this->block_offset = num_items; // Initialize past-the-end - this->block_end = num_items; // Initialize past-the-end - this->num_items = num_items; - this->total_tiles = (num_items + tile_items - 1) / tile_items; - this->grid_size = CUB_MIN(total_tiles, max_grid_size); - OffsetT avg_tiles_per_block = total_tiles / grid_size; - this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks - this->normal_share_items = avg_tiles_per_block * tile_items; - this->normal_base_offset = big_shares * tile_items; - this->big_share_items = normal_share_items + tile_items; - } - - - /** - * \brief Initializes ranges for the specified thread block index. Specialized - * for a "raking" access pattern in which each thread block is assigned a - * consecutive sequence of input tiles. 
- */ - template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) - { - block_stride = TILE_ITEMS; - if (block_id < big_shares) - { - // This thread block gets a big share of grains (avg_tiles_per_block + 1) - block_offset = (block_id * big_share_items); - block_end = block_offset + big_share_items; - } - else if (block_id < total_tiles) - { - // This thread block gets a normal share of grains (avg_tiles_per_block) - block_offset = normal_base_offset + (block_id * normal_share_items); - block_end = CUB_MIN(num_items, block_offset + normal_share_items); - } - // Else default past-the-end - } - - - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. - */ - template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) - { - block_stride = grid_size * TILE_ITEMS; - block_offset = (block_id * TILE_ITEMS); - block_end = num_items; - } - - - /** - * \brief Block-initialization, specialized for "strip mining" access - * pattern in which the input tiles assigned to each thread block are - * separated by a stride equal to the the extent of the grid. - */ - template < - int TILE_ITEMS, - GridMappingStrategy STRATEGY> - __device__ __forceinline__ void BlockInit() - { - BlockInit(blockIdx.x, Int2Type()); - } - - - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. 
- */ - template - __device__ __forceinline__ void BlockInit( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) - { - this->block_offset = block_offset; - this->block_end = block_end; - this->block_stride = TILE_ITEMS; - } - - -}; - - - - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh deleted file mode 100644 index 6cd8920..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_mapping.cuh +++ /dev/null @@ -1,113 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ - -#pragma once - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/****************************************************************************** - * Mapping policies - *****************************************************************************/ - - -/** - * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ -enum GridMappingStrategy -{ - /** - * \brief An a "raking" access pattern in which each thread block is - * assigned a consecutive sequence of input tiles - * - * \par Overview - * The input is evenly partitioned into \p p segments, where \p p is - * constant and corresponds loosely to the number of thread blocks that may - * actively reside on the target device. Each segment is comprised of - * consecutive tiles, where a tile is a small, constant-sized unit of input - * to be processed to completion before the thread block terminates or - * obtains more work. 
The kernel invokes \p p thread blocks, each - * of which iteratively consumes a segment of n/p elements - * in tile-size increments. - */ - GRID_MAPPING_RAKE, - - /** - * \brief An a "strip mining" access pattern in which the input tiles assigned - * to each thread block are separated by a stride equal to the the extent of - * the grid. - * - * \par Overview - * The input is evenly partitioned into \p p sets, where \p p is - * constant and corresponds loosely to the number of thread blocks that may - * actively reside on the target device. Each set is comprised of - * data tiles separated by stride \p tiles, where a tile is a small, - * constant-sized unit of input to be processed to completion before the - * thread block terminates or obtains more work. The kernel invokes \p p - * thread blocks, each of which iteratively consumes a segment of - * n/p elements in tile-size increments. - */ - GRID_MAPPING_STRIP_MINE, - - /** - * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. - * - * \par Overview - * The input is treated as a queue to be dynamically consumed by a grid of - * thread blocks. Work is atomically dequeued in tiles, where a tile is a - * unit of input to be processed to completion before the thread block - * terminates or obtains more work. The grid size \p p is constant, - * loosely corresponding to the number of thread blocks that may actively - * reside on the target device. 
- */ - GRID_MAPPING_DYNAMIC, -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh deleted file mode 100644 index f413c6d..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/grid/grid_queue.cuh +++ /dev/null @@ -1,220 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridQueue is a descriptor utility for dynamic queue management. - */ - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_debug.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridQueue is a descriptor utility for dynamic queue management. - * - * \par Overview - * GridQueue descriptors provides abstractions for "filling" or - * "draining" globally-shared vectors. - * - * \par - * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, - * returning a unique offset for the calling thread to write its items. - * The GridQueue maintains the total "fill-size". The fill counter must be reset - * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that - * will be filling. - * - * \par - * Similarly, a "draining" GridQueue works by works by atomically-incrementing a - * zero-initialized counter, returning a unique offset for the calling thread to - * read its items. Threads can safely drain until the array's logical fill-size is - * exceeded. The drain counter must be reset using GridQueue::ResetDrain or - * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that - * will be filling. 
(For dynamic work distribution of existing data, the corresponding fill-size - * is simply the number of elements in the array.) - * - * \par - * Iterative work management can be implemented simply with a pair of flip-flopping - * work buffers, each with an associated set of fill and drain GridQueue descriptors. - * - * \tparam OffsetT Signed integer type for global offsets - */ -template -class GridQueue -{ -private: - - /// Counter indices - enum - { - FILL = 0, - DRAIN = 1, - }; - - /// Pair of counters - OffsetT *d_counters; - -public: - - /// Returns the device allocation size in bytes needed to construct a GridQueue instance - __host__ __device__ __forceinline__ - static size_t AllocationSize() - { - return sizeof(OffsetT) * 2; - } - - - /// Constructs an invalid GridQueue descriptor - __host__ __device__ __forceinline__ GridQueue() - : - d_counters(NULL) - {} - - - /// Constructs a GridQueue descriptor around the device storage allocation - __host__ __device__ __forceinline__ GridQueue( - void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). - : - d_counters((OffsetT*) d_storage) - {} - - - /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( - OffsetT fill_size, - cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[FILL] = fill_size; - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - OffsetT counters[2]; - counters[FILL] = fill_size; - counters[DRAIN] = 0; - return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); -#endif - } - - - /// This operation resets the drain so that it may advance to meet the existing fill-size. 
To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); -#endif - } - - - /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. - __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[FILL] = 0; - return cudaSuccess; -#else - return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); -#endif - } - - - /// Returns the fill-size established by the parent or by the previous kernel. - __host__ __device__ __forceinline__ cudaError_t FillSize( - OffsetT &fill_size, - cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - fill_size = d_counters[FILL]; - return cudaSuccess; -#else - return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); -#endif - } - - - /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. - __device__ __forceinline__ OffsetT Drain(OffsetT num_items) - { - return atomicAdd(d_counters + DRAIN, num_items); - } - - - /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
- __device__ __forceinline__ OffsetT Fill(OffsetT num_items) - { - return atomicAdd(d_counters + FILL, num_items); - } -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Reset grid queue (call with 1 block of 1 thread) - */ -template -__global__ void FillAndResetDrainKernel( - GridQueue grid_queue, - OffsetT num_items) -{ - grid_queue.FillAndResetDrain(num_items); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh deleted file mode 100644 index 0054f1f..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/host/mutex.cuh +++ /dev/null @@ -1,171 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Simple portable mutex - */ - - -#pragma once - -#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) - #include -#else - #if defined(_WIN32) || defined(_WIN64) - #include - - #define WIN32_LEAN_AND_MEAN - #define NOMINMAX - #include - #undef WIN32_LEAN_AND_MEAN - #undef NOMINMAX - - /** - * Compiler read/write barrier - */ - #pragma intrinsic(_ReadWriteBarrier) - - #endif -#endif - -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Simple portable mutex - * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) - * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) - */ -struct Mutex -{ -#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) - - std::mutex mtx; - - void Lock() - { - mtx.lock(); - } - - void Unlock() - { - mtx.unlock(); - } - - void TryLock() - { - mtx.try_lock(); - } - -#else //__cplusplus > 199711L - - #if defined(_MSC_VER) - - // Microsoft VC++ - typedef long Spinlock; - - 
#else - - // GNU g++ - typedef int Spinlock; - - /** - * Compiler read/write barrier - */ - __forceinline__ void _ReadWriteBarrier() - { - __sync_synchronize(); - } - - /** - * Atomic exchange - */ - __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) - { - // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier - _ReadWriteBarrier(); - return __sync_lock_test_and_set(Target, Value); - } - - /** - * Pause instruction to prevent excess processor bus usage - */ - __forceinline__ void YieldProcessor() - { - } - - #endif // defined(_MSC_VER) - - /// Lock member - volatile Spinlock lock; - - /** - * Constructor - */ - Mutex() : lock(0) {} - - /** - * Return when the specified spinlock has been acquired - */ - __forceinline__ void Lock() - { - while (1) - { - if (!_InterlockedExchange(&lock, 1)) return; - while (lock) YieldProcessor(); - } - } - - - /** - * Release the specified spinlock - */ - __forceinline__ void Unlock() - { - _ReadWriteBarrier(); - lock = 0; - } - -#endif // __cplusplus > 199711L - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh deleted file mode 100644 index d3bce58..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/arg_index_input_iterator.cuh +++ /dev/null @@ -1,259 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#include - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). - * - * \par Overview - * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. - * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose - * \p key field is \p i and whose \p value field is itr[i]. - * - Can be used with any data type. - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions. Wrapped host memory can only be dereferenced on the host, and wrapped - * device memory can only be dereferenced on the device. - * - Compatible with Thrust API v1.7 or newer. 
- * - * \par Snippet - * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto - * dereference an array of doubles - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::ArgIndexInputIterator itr(d_in); - * - * // Within device code: - * typedef typename cub::ArgIndexInputIterator::value_type Tuple; - * Tuple item_offset_pair.key = *itr; - * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.key); // 8.0 @ 0 - * - * itr = itr + 6; - * item_offset_pair.key = *itr; - * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.key); // 9.0 @ 6 - * - * \endcode - * - * \tparam InputIteratorT The value type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) - */ -template < - typename InputIteratorT, - typename OffsetT = ptrdiff_t, - typename OutputValueT = typename std::iterator_traits::value_type> -class ArgIndexInputIterator -{ -public: - - // Required iterator traits - typedef ArgIndexInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef KeyValuePair value_type; ///< The type of the element the iterator can point to - typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to - typedef value_type reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - 
>::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - InputIteratorT itr; - difference_type offset; - -public: - - /// Constructor - __host__ __device__ __forceinline__ ArgIndexInputIterator( - InputIteratorT itr, ///< Input iterator to wrap - difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator - : - itr(itr), - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - value_type retval; - retval.value = itr[offset]; - retval.key = offset; - return retval; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(itr, offset + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(itr, offset - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure dereference - __host__ __device__ 
__forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((itr == rhs.itr) && (offset == rhs.offset)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((itr != rhs.itr) || (offset != rhs.offset)); - } - - /// Normalize - __host__ __device__ __forceinline__ void normalize() - { - itr += offset; - offset = 0; - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) - { - return os; - } -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh deleted file mode 100644 index 0c0252c..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_input_iterator.cuh +++ /dev/null @@ -1,240 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. - * - * \par Overview - * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by reading \p ValueType values through loads modified by \p MODIFIER. 
- * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", - * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions, but can only be dereferenced within device functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto - * dereference a device array of double using the "ldg" PTX load modifier - * (i.e., load values through texture cache). - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::CacheModifiedInputIterator itr(d_in); - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * \endcode - * - * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT = ptrdiff_t> -class CacheModifiedInputIterator -{ -public: - - // Required iterator traits - typedef CacheModifiedInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - 
typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - - -public: - - /// Wrapped native pointer - ValueType* ptr; - - /// Constructor - template - __host__ __device__ __forceinline__ CacheModifiedInputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::Type *>(ptr)) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - ptr++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - ptr++; - return *this; - } - - /// Indirection - __device__ __forceinline__ reference operator*() const - { - return ThreadLoad(ptr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(ptr + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - ptr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(ptr - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - ptr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return ptr - other.ptr; - } - - /// Array subscript - template - __device__ __forceinline__ reference operator[](Distance n) const - { - return ThreadLoad(ptr + n); - } - - /// Structure dereference - __device__ __forceinline__ pointer operator->() - { - return &ThreadLoad(ptr); - } - - /// Equal to - 
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) - { - return os; - } -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh deleted file mode 100644 index 8dbaafa..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/cache_modified_output_iterator.cuh +++ /dev/null @@ -1,254 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. - * - * \par Overview - * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by writing \p ValueType values through stores modified by \p MODIFIER. - * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", - * "STORE_CG", "STORE_CS", "STORE_WT", etc.). 
- * - Can be constructed, manipulated, and exchanged within and between host and device - * functions, but can only be dereferenced within device functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to - * dereference a device array of doubles using the "wt" PTX load modifier - * (i.e., write-through to system memory). - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_out; // e.g., [, , , , , , ] - * - * // Create an iterator wrapper - * cub::CacheModifiedOutputIterator itr(d_out); - * - * // Within device code: - * itr[0] = 8.0; - * itr[1] = 66.0; - * itr[55] = 24.0; - * - * \endcode - * - * \par Usage Considerations - * - Can only be dereferenced within device code - * - * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - CacheStoreModifier MODIFIER, - typename ValueType, - typename OffsetT = ptrdiff_t> -class CacheModifiedOutputIterator -{ -private: - - // Proxy object - struct Reference - { - ValueType* ptr; - - /// Constructor - __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} - - /// Assignment - __device__ __forceinline__ ValueType operator =(ValueType val) - { - ThreadStore(ptr, val); - return val; - } - }; - -public: - - // Required iterator traits - typedef CacheModifiedOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef Reference reference; ///< The type of a reference to an element the 
iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ValueType* ptr; - -public: - - /// Constructor - template - __host__ __device__ __forceinline__ CacheModifiedOutputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::Type *>(ptr)) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - ptr++; - return retval; - } - - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - ptr++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return Reference(ptr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(ptr + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - ptr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(ptr - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - ptr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return ptr - other.ptr; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - return 
Reference(ptr + n); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh deleted file mode 100644 index 0b7af47..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/constant_input_iterator.cuh +++ /dev/null @@ -1,235 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input generator for dereferencing a sequence of homogeneous values - * - * \par Overview - * - Read references to a ConstantInputIteratorTiterator always return the supplied constant - * of type \p ValueType. - * - Can be used with any data type. - * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device - * functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p ConstantInputIteratorTto - * dereference a sequence of homogeneous doubles. 
- * \par - * \code - * #include // or equivalently - * - * cub::ConstantInputIterator itr(5.0); - * - * printf("%f\n", itr[0]); // 5.0 - * printf("%f\n", itr[1]); // 5.0 - * printf("%f\n", itr[2]); // 5.0 - * printf("%f\n", itr[50]); // 5.0 - * - * \endcode - * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename ValueType, - typename OffsetT = ptrdiff_t> -class ConstantInputIterator -{ -public: - - // Required iterator traits - typedef ConstantInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ValueType val; - OffsetT offset; -#ifdef _WIN32 - OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) -#endif - -public: - - /// Constructor - __host__ __device__ __forceinline__ ConstantInputIterator( - ValueType val, ///< Starting value for the iterator instance to report - OffsetT offset = 0) ///< Base offset - : - val(val), - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { 
- self_type retval = *this; - offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return val; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(val, offset + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(val, offset - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const - { - return val; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &val; - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (offset == rhs.offset) && ((val == rhs.val)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (offset != rhs.offset) || (val!= rhs.val); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.val << "," << itr.offset << "]"; - return os; - } - -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh deleted file mode 100644 index 3b42a00..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/counting_input_iterator.cuh +++ /dev/null @@ -1,228 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - -/** - * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. - * - * \par Overview - * - After initializing a CountingInputIteratorTto a certain integer \p base, read references - * at \p offset will return the value \p base + \p offset. - * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device - * functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p CountingInputIteratorTto - * dereference a sequence of incrementing integers. 
- * \par - * \code - * #include // or equivalently - * - * cub::CountingInputIterator itr(5); - * - * printf("%d\n", itr[0]); // 5 - * printf("%d\n", itr[1]); // 6 - * printf("%d\n", itr[2]); // 7 - * printf("%d\n", itr[50]); // 55 - * - * \endcode - * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename ValueType, - typename OffsetT = ptrdiff_t> -class CountingInputIterator -{ -public: - - // Required iterator traits - typedef CountingInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ValueType val; - -public: - - /// Constructor - __host__ __device__ __forceinline__ CountingInputIterator( - const ValueType &val) ///< Starting value for the iterator instance to report - : - val(val) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - val++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - val++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ 
reference operator*() const - { - return val; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(val + (ValueType) n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - val += (ValueType) n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(val - (ValueType) n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - val -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return (difference_type) (val - other.val); - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - return val + (ValueType) n; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &val; - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (val == rhs.val); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (val != rhs.val); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.val << "]"; - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh deleted file mode 100644 index 1fca08c..0000000 --- a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/discard_output_iterator.cuh +++ /dev/null @@ -1,220 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A discard iterator - */ -template -class DiscardOutputIterator -{ -public: - - // Required iterator traits - typedef DiscardOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef void reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - OffsetT offset; - -#if defined(_WIN32) || !defined(_WIN64) - // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) - OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; -#endif - -public: - - /// Constructor - __host__ __device__ __forceinline__ DiscardOutputIterator( - OffsetT offset = 0) ///< Base offset - : - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ 
self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ self_type& operator*() - { - // return self reference, which can be assigned to anything - return *this; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(offset + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(offset - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ self_type& operator[](Distance n) - { - // return self reference, which can be assigned to anything - return *this; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return; - } - - /// Assignment to self (no-op) - __host__ __device__ __forceinline__ void operator=(self_type const& other) - { - offset = other.offset; - } - - /// Assignment to anything else (no-op) - template - __host__ __device__ __forceinline__ void operator=(T const&) - {} - - /// Cast to void* operator - __host__ __device__ __forceinline__ operator void*() const { return NULL; } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (offset == rhs.offset); - } - - /// Not equal to - __host__ 
__device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (offset != rhs.offset); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.offset << "]"; - return os; - } - -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh deleted file mode 100644 index 6236094..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_obj_input_iterator.cuh +++ /dev/null @@ -1,310 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - - -/** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. - * - * \par Overview - * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References - * to elements are to be loaded through texture cache. - * - Can be used to load any data type from memory through texture cache. - * - Can be manipulated and exchanged within and between host and device - * functions, can only be constructed within host functions, and can only be - * dereferenced within device functions. - * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be - * created by the host thread, but can be used by any descendant kernel. - * - Compatible with Thrust API v1.7 or newer. 
- * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIteratorTto - * dereference a device array of doubles through texture cache. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * int num_items; // e.g., 7 - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::TexObjInputIterator itr; - * itr.BindTexture(d_in, sizeof(double) * num_items); - * ... - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * ... - * itr.UnbindTexture(); - * - * \endcode - * - * \tparam T The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename T, - typename OffsetT = ptrdiff_t> -class TexObjInputIterator -{ -public: - - // Required iterator traits - typedef TexObjInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - // Largest texture word we can use in device - typedef typename UnitWord::TextureWord TextureWord; - - // Number of texture words per 
T - enum { - TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) - }; - -private: - - T* ptr; - difference_type tex_offset; - cudaTextureObject_t tex_obj; - -public: - - /// Constructor - __host__ __device__ __forceinline__ TexObjInputIterator() - : - ptr(NULL), - tex_offset(0), - tex_obj(0) - {} - - /// Use this iterator to bind \p ptr with a texture reference - template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator - { - this->ptr = const_cast::Type *>(ptr); - this->tex_offset = tex_offset; - - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - cudaResourceDesc res_desc; - cudaTextureDesc tex_desc; - memset(&res_desc, 0, sizeof(cudaResourceDesc)); - memset(&tex_desc, 0, sizeof(cudaTextureDesc)); - res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = this->ptr; - res_desc.res.linear.desc = channel_desc; - res_desc.res.linear.sizeInBytes = bytes; - tex_desc.readMode = cudaReadModeElementType; - return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); - } - - /// Unbind this iterator from its texture reference - cudaError_t UnbindTexture() - { - return cudaDestroyTextureObject(tex_obj); - } - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - tex_offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - tex_offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - 
#pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); - } - - // Load from words - return *reinterpret_cast(words); -#endif - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_obj = tex_obj; - retval.tex_offset = tex_offset + n; - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - tex_offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_obj = tex_obj; - retval.tex_offset = tex_offset - n; - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - tex_offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return tex_offset - other.tex_offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - 
-} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh deleted file mode 100644 index da1fd16..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/tex_ref_input_iterator.cuh +++ /dev/null @@ -1,374 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../util_namespace.cuh" - -#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer - -#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Static file-scope Tesla/Fermi-style texture references - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -// Anonymous namespace -namespace { - -/// Global texture reference specialized by type -template -struct IteratorTexRef -{ - /// And by unique ID - template - struct TexId - { - // Largest texture word we can use in device - typedef typename UnitWord::DeviceWord DeviceWord; - typedef typename UnitWord::TextureWord TextureWord; - - // Number of texture words per T - enum { - DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), - TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) - }; - - // 
Texture reference type - typedef texture TexRef; - - // Texture reference - static TexRef ref; - - /// Bind texture - static cudaError_t BindTexture(void *d_in, size_t &offset) - { - if (d_in) - { - cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); - ref.channelDesc = tex_desc; - return (CubDebug(cudaBindTexture(&offset, ref, d_in))); - } - - return cudaSuccess; - } - - /// Unbind texture - static cudaError_t UnbindTexture() - { - return CubDebug(cudaUnbindTexture(ref)); - } - - /// Fetch element - template - static __device__ __forceinline__ T Fetch(Distance tex_offset) - { - DeviceWord temp[DEVICE_MULTIPLE]; - TextureWord *words = reinterpret_cast(temp); - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); - } - - return reinterpret_cast(temp); - } - }; -}; - -// Texture reference definitions -template -template -typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; - - -} // Anonymous namespace - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/** - * \addtogroup UtilIterator - * @{ - */ - - - -/** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. - * - * \par Overview - * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References - * to elements are to be loaded through texture cache. - * - Can be used to load any data type from memory through texture cache. - * - Can be manipulated and exchanged within and between host and device - * functions, can only be constructed within host functions, and can only be - * dereferenced within device functions. - * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture - * reference. 
Only one TexRefInputIteratorTinstance can be bound at any given time for a - * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host - * thread, and (4) compilation .o unit. - * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be - * created by the host thread and used by a top-level kernel (i.e. the one which is launched - * from the host). - * - Compatible with Thrust API v1.7 or newer. - * - Compatible with CUDA toolkit v5.5 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIteratorTto - * dereference a device array of doubles through texture cache. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * int num_items; // e.g., 7 - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::TexRefInputIterator itr; - * itr.BindTexture(d_in, sizeof(double) * num_items); - * ... - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * ... 
- * itr.UnbindTexture(); - * - * \endcode - * - * \tparam T The value type of this iterator - * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename T, - int UNIQUE_ID, - typename OffsetT = ptrdiff_t> -class TexRefInputIterator -{ -public: - - // Required iterator traits - typedef TexRefInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - T* ptr; - difference_type tex_offset; - - // Texture reference wrapper (old Tesla/Fermi-style textures) - typedef typename IteratorTexRef::template TexId TexId; - -public: -/* - /// Constructor - __host__ __device__ __forceinline__ TexRefInputIterator() - : - ptr(NULL), - tex_offset(0) - {} -*/ - /// Use this iterator to bind \p ptr with a texture reference - template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position 
of the iterator - { - this->ptr = const_cast::Type *>(ptr); - size_t offset; - cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); - this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); - return retval; - } - - /// Unbind this iterator from its texture reference - cudaError_t UnbindTexture() - { - return TexId::UnbindTexture(); - } - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - tex_offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - tex_offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Use the texture reference - return TexId::Fetch(tex_offset); -#endif - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_offset = tex_offset + n; - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - tex_offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_offset = tex_offset - n; - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - tex_offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return tex_offset - other.tex_offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure 
dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - -#endif // CUDA_VERSION diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh deleted file mode 100644 index 39258a4..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/iterator/transform_input_iterator.cuh +++ /dev/null @@ -1,252 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for transforming dereferenced values. - * - * \par Overview - * - TransformInputIteratorTwraps a unary conversion functor of type \p - * ConversionOp and a random-access input iterator of type InputIteratorT, - * using the former to produce references of type \p ValueType from the latter. - * - Can be used with any data type. 
- * - Can be constructed, manipulated, and exchanged within and between host and device - * functions. Wrapped host memory can only be dereferenced on the host, and wrapped - * device memory can only be dereferenced on the device. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p TransformInputIteratorTto - * dereference an array of integers, tripling the values and converting them to doubles. - * \par - * \code - * #include // or equivalently - * - * // Functor for tripling integer values and converting to doubles - * struct TripleDoubler - * { - * __host__ __device__ __forceinline__ - * double operator()(const int &a) const { - * return double(a * 3); - * } - * }; - * - * // Declare, allocate, and initialize a device array - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * TripleDoubler conversion_op; - * - * // Create an iterator wrapper - * cub::TransformInputIterator itr(d_in, conversion_op); - * - * // Within device code: - * printf("%f\n", itr[0]); // 24.0 - * printf("%f\n", itr[1]); // 18.0 - * printf("%f\n", itr[6]); // 27.0 - * - * \endcode - * - * \tparam ValueType The value type of this iterator - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
- * \tparam InputIteratorT The type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * - */ -template < - typename ValueType, - typename ConversionOp, - typename InputIteratorT, - typename OffsetT = ptrdiff_t> -class TransformInputIterator -{ -public: - - // Required iterator traits - typedef TransformInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ConversionOp conversion_op; - InputIteratorT input_itr; - -public: - - /// Constructor - __host__ __device__ __forceinline__ TransformInputIterator( - InputIteratorT input_itr, ///< Input iterator to wrap - ConversionOp conversion_op) ///< Conversion functor to wrap - : - conversion_op(conversion_op), - input_itr(input_itr) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - input_itr++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - input_itr++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return 
conversion_op(*input_itr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(input_itr + n, conversion_op); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - input_itr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(input_itr - n, conversion_op); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - input_itr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return input_itr - other.input_itr; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - return conversion_op(input_itr[n]); - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &conversion_op(*input_itr); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (input_itr == rhs.input_itr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (input_itr != rhs.input_itr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh deleted file mode 100644 index 9de4bd4..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_load.cuh +++ /dev/null @@ 
-1,438 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for reading memory using PTX cache modifiers. 
- */ - -#pragma once - -#include - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of cache modifiers for memory load operations. - */ -enum CacheLoadModifier -{ - LOAD_DEFAULT, ///< Default (no modifier) - LOAD_CA, ///< Cache at all levels - LOAD_CG, ///< Cache at global level - LOAD_CS, ///< Cache streaming (likely to be accessed once) - LOAD_CV, ///< Cache as volatile (including cached system lines) - LOAD_LDG, ///< Cache as texture - LOAD_VOLATILE, ///< Volatile (any memory space) -}; - - -/** - * \name Thread I/O (cache modified) - * @{ - */ - -/** - * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
- * - * \par Example - * \code - * #include // or equivalently - * - * // 32-bit load using cache-global modifier: - * int *d_in; - * int val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 16-bit load using default modifier - * short *d_in; - * short val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 256-bit load using cache-volatile modifier - * double4 *d_in; - * double4 val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 96-bit load using cache-streaming modifier - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); - * \endcode - * - * \tparam MODIFIER [inferred] CacheLoadModifier enumeration - * \tparam InputIteratorT [inferred] Input iterator type \iterator - */ -template < - CacheLoadModifier MODIFIER, - typename InputIteratorT> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); - - -//@} end member group - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/// Helper structure for templated load iteration (inductive case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T const *ptr, T *vals) - { - vals[COUNT] = ThreadLoad(ptr + COUNT); - IterateThreadLoad::template Load(ptr, vals); - } - - template - static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) - { - vals[COUNT] = itr[COUNT]; - IterateThreadLoad::Dereference(itr, vals); - } -}; - - -/// Helper structure for templated load iteration (termination case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} - - template - static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} -}; - - -/** - * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ uint4 
ThreadLoad(uint4 const *ptr) \ - { \ - uint4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ - "=r"(retval.x), \ - "=r"(retval.y), \ - "=r"(retval.z), \ - "=r"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ - { \ - ulonglong2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ - "=l"(retval.x), \ - "=l"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ - { \ - ushort4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ - "=h"(retval.x), \ - "=h"(retval.y), \ - "=h"(retval.z), \ - "=h"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ - { \ - uint2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ - "=r"(retval.x), \ - "=r"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ - { \ - unsigned long long retval; \ - asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ - "=l"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ - { \ - unsigned int retval; \ - asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ - "=r"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define a unsigned short (2B) ThreadLoad specialization for the given Cache 
load modifier - */ -#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ - { \ - unsigned short retval; \ - asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ - { \ - unsigned short retval; \ - asm volatile ( \ - "{" \ - " .reg .u8 datum;" \ - " ld."#ptx_modifier".u8 datum, [%1];" \ - " cvt.u16.u8 %0, datum;" \ - "}" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return (unsigned char) retval; \ - } - - -/** - * Define powers-of-two ThreadLoad specializations for the given Cache load modifier - */ -#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ - _CUB_LOAD_16(cub_modifier, ptx_modifier) \ - _CUB_LOAD_8(cub_modifier, ptx_modifier) \ - _CUB_LOAD_4(cub_modifier, ptx_modifier) \ - _CUB_LOAD_2(cub_modifier, ptx_modifier) \ - _CUB_LOAD_1(cub_modifier, ptx_modifier) \ - - -/** - * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers - */ -#if CUB_PTX_ARCH >= 200 - _CUB_LOAD_ALL(LOAD_CA, ca) - _CUB_LOAD_ALL(LOAD_CG, cg) - _CUB_LOAD_ALL(LOAD_CS, cs) - _CUB_LOAD_ALL(LOAD_CV, cv) -#else - _CUB_LOAD_ALL(LOAD_CA, global) - // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 - _CUB_LOAD_ALL(LOAD_CG, volatile.global) - _CUB_LOAD_ALL(LOAD_CS, global) - _CUB_LOAD_ALL(LOAD_CV, volatile.global) -#endif - -#if CUB_PTX_ARCH >= 350 - _CUB_LOAD_ALL(LOAD_LDG, global.nc) -#else - _CUB_LOAD_ALL(LOAD_LDG, global) -#endif - - -// Macro cleanup -#undef _CUB_LOAD_ALL -#undef _CUB_LOAD_1 -#undef _CUB_LOAD_2 -#undef _CUB_LOAD_4 -#undef _CUB_LOAD_8 -#undef _CUB_LOAD_16 - - - -/** - * ThreadLoad 
definition for LOAD_DEFAULT modifier on iterator types - */ -template -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( - InputIteratorT itr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - return *itr; -} - - -/** - * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - return *ptr; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatilePointer( - T *ptr, - Int2Type /*is_primitive*/) -{ - T retval = *reinterpret_cast(ptr); - return retval; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatilePointer( - T *ptr, - Int2Type /*is_primitive*/) -{ - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying - - const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); -/* - VolatileWord words[VOLATILE_MULTIPLE]; - - IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( - reinterpret_cast(ptr), - words); - - return *reinterpret_cast(words); -*/ - - T retval; - VolatileWord *words = reinterpret_cast(&retval); - IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( - reinterpret_cast(ptr), - words); - return retval; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - // Apply tags for partial-specialization - return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); -} - - -/** - * ThreadLoad definition for generic modifiers on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T const *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - typedef typename UnitWord::DeviceWord 
DeviceWord; - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); - - DeviceWord words[DEVICE_MULTIPLE]; - - IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( - reinterpret_cast(const_cast(ptr)), - words); - - return *reinterpret_cast(words); -} - - -/** - * ThreadLoad definition for generic modifiers - */ -template < - CacheLoadModifier MODIFIER, - typename InputIteratorT> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) -{ - // Apply tags for partial-specialization - return ThreadLoad( - itr, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilIo - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh deleted file mode 100644 index 2bd5403..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_operators.cuh +++ /dev/null @@ -1,317 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Simple binary operator functor types - */ - -/****************************************************************************** - * Simple functor operators - ******************************************************************************/ - -#pragma once - -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -/** - * \brief Default equality functor - */ -struct Equality -{ - /// Boolean equality operator, returns (a == b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a == b; - } -}; - - -/** - * \brief Default inequality functor - */ -struct Inequality -{ - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool 
operator()(const T &a, const T &b) const - { - return a != b; - } -}; - - -/** - * \brief Inequality functor (wraps equality functor) - */ -template -struct InequalityWrapper -{ - /// Wrapped equality operator - EqualityOp op; - - /// Constructor - __host__ __device__ __forceinline__ - InequalityWrapper(EqualityOp op) : op(op) {} - - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return !op(a, b); - } -}; - - -/** - * \brief Default sum functor - */ -struct Sum -{ - /// Boolean sum operator, returns a + b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return a + b; - } -}; - - -/** - * \brief Default max functor - */ -struct Max -{ - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MAX(a, b); - } -}; - - -/** - * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) - */ -struct ArgMax -{ - /// Boolean max operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - - if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; - } -}; - - -/** - * \brief Default min functor - */ -struct Min -{ - /// Boolean min operator, returns (a < b) ? 
a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MIN(a, b); - } -}; - - -/** - * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) - */ -struct ArgMin -{ - /// Boolean min operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - - if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; - } -}; - - -/** - * \brief Default cast functor - */ -template -struct CastOp -{ - /// Cast operator, returns (B) a - template - __host__ __device__ __forceinline__ B operator()(const A &a) const - { - return (B) a; - } -}; - - -/** - * \brief Binary operator wrapper for switching non-commutative scan arguments - */ -template -class SwizzleScanOp -{ -private: - - /// Wrapped scan operator - ScanOp scan_op; - -public: - - /// Constructor - __host__ __device__ __forceinline__ - SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} - - /// Switch the scan arguments - template - __host__ __device__ __forceinline__ - T operator()(const T &a, const T &b) - { - T _a(a); - T _b(b); - - return scan_op(_b, _a); - } -}; - - -/** - * \brief Reduce-by-segment functor. - * - * Given two cub::KeyValuePair inputs \p a and \p b and a - * binary associative combining operator \p f(const T &x, const T &y), - * an instance of this functor returns a cub::KeyValuePair whose \p key - * field is a.key + b.key, and whose \p value field - * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. - * - * ReduceBySegmentOp is an associative, non-commutative binary combining operator - * for input sequences of cub::KeyValuePair pairings. 
Such - * sequences are typically used to represent a segmented set of values to be reduced - * and a corresponding set of {0,1}-valued integer "head flags" demarcating the - * first value of each segment. - * - */ -template ///< Binary reduction operator to apply to values -struct ReduceBySegmentOp -{ - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp() {} - - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} - - /// Scan operator - template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval; - retval.key = first.key + second.key; - retval.value = (second.key) ? - second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate - op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate - return retval; - } -}; - - - -template ///< Binary reduction operator to apply to values -struct ReduceByKeyOp -{ - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp() {} - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} - - /// Scan operator - template - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval = second; - - if (first.key == second.key) - retval.value = op(first.value, retval.value); - - return retval; - } -}; - - - - - - - -/** @} */ // end group UtilModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer 
namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh deleted file mode 100644 index 9e27705..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_reduce.cuh +++ /dev/null @@ -1,152 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential reduction over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) -namespace internal { - -/** - * Sequential reduction over statically-sized array types - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix, ///< [in] Prefix to seed reduction with - Int2Type /*length*/) -{ - T retval = prefix; - - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - retval = reduction_op(retval, input[i]); - - return retval; -} - - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - return ThreadReduce(input, reduction_op, prefix, Int2Type()); -} - - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - T prefix = input[0]; - return ThreadReduce(input + 1, reduction_op, prefix); -} - - -/** - * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - return ThreadReduce(input, reduction_op, prefix, Int2Type()); -} - - -/** - * \brief Serial reduction with the specified operator - * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - return ThreadReduce((T*) input, reduction_op); -} - - -} // internal namespace -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh deleted file mode 100644 index 545b414..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_scan.cuh +++ /dev/null @@ -1,268 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential prefix scan over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) -namespace internal { - - -/** - * \addtogroup UtilModule - * @{ - */ - -/** - * \name Sequential prefix scan over statically-sized array types - * @{ - */ - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T inclusive, - T exclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) -{ - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - inclusive = scan_op(exclusive, input[i]); - output[i] = exclusive; - exclusive = inclusive; - } - - return inclusive; -} - - - -/** - * \brief Perform a sequential exclusive prefix scan over \p 
LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = prefix; - T exclusive = inclusive; - - return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
-{ - return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); -} - - - - - - - - - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T inclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) -{ - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - inclusive = scan_op(inclusive, input[i]); - output[i] = inclusive; - } - - return inclusive; -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - T inclusive = input[0]; - output[0] = inclusive; - - // Continue scan - return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = inclusive; - - // Continue scan - return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); -} - - -//@} end member group - -/** @} */ // end group UtilModule - - -} // internal namespace -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh deleted file mode 100644 index 379a08a..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_search.cuh +++ /dev/null @@ -1,154 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential search - */ - -#pragma once - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Computes the begin offsets into A and B for the specific diagonal - */ -template < - typename AIteratorT, - typename BIteratorT, - typename OffsetT, - typename CoordinateT> -__host__ __device__ __forceinline__ void MergePathSearch( - OffsetT diagonal, - AIteratorT a, - BIteratorT b, - OffsetT a_len, - OffsetT b_len, - CoordinateT& path_coordinate) -{ - /// The value type of the input iterator - typedef typename std::iterator_traits::value_type T; - - OffsetT split_min = CUB_MAX(diagonal - b_len, 0); - OffsetT split_max = CUB_MIN(diagonal, a_len); - - while (split_min < split_max) - { - OffsetT split_pivot = (split_min + split_max) >> 1; - if (a[split_pivot] <= b[diagonal - split_pivot - 1]) - { - // 
Move candidate split range up A, down B - split_min = split_pivot + 1; - } - else - { - // Move candidate split range up B, down A - split_max = split_pivot; - } - } - - path_coordinate.x = CUB_MIN(split_min, a_len); - path_coordinate.y = diagonal - split_min; -} - - - -/** - * \brief Returns the offset of the first value within \p input which does not compare less than \p val - */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT LowerBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key -{ - OffsetT retval = 0; - while (num_items > 0) - { - OffsetT half = num_items >> 1; - if (input[retval + half] < val) - { - retval = retval + (half + 1); - num_items = num_items - (half + 1); - } - else - { - num_items = half; - } - } - - return retval; -} - - -/** - * \brief Returns the offset of the first value within \p input which compares greater than \p val - */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key -{ - OffsetT retval = 0; - while (num_items > 0) - { - OffsetT half = num_items >> 1; - if (val < input[retval + half]) - { - num_items = half; - } - else - { - retval = retval + (half + 1); - num_items = num_items - (half + 1); - } - } - - return retval; -} - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh deleted file mode 100644 index 14ee84d..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/thread/thread_store.cuh +++ /dev/null @@ -1,422 +0,0 @@ 
-/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for writing memory using PTX cache modifiers. 
- */ - -#pragma once - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of cache modifiers for memory store operations. - */ -enum CacheStoreModifier -{ - STORE_DEFAULT, ///< Default (no modifier) - STORE_WB, ///< Cache write-back all coherent levels - STORE_CG, ///< Cache at global level - STORE_CS, ///< Cache streaming (likely to be accessed once) - STORE_WT, ///< Cache write-through (to system memory) - STORE_VOLATILE, ///< Volatile shared (any memory space) -}; - - -/** - * \name Thread I/O (cache modified) - * @{ - */ - -/** - * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
- * - * \par Example - * \code - * #include // or equivalently - * - * // 32-bit store using cache-global modifier: - * int *d_out; - * int val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 16-bit store using default modifier - * short *d_out; - * short val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 256-bit store using write-through modifier - * double4 *d_out; - * double4 val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 96-bit store using cache-streaming cache modifier - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * \endcode - * - * \tparam MODIFIER [inferred] CacheStoreModifier enumeration - * \tparam InputIteratorT [inferred] Output iterator type \iterator - * \tparam T [inferred] Data type of output value - */ -template < - CacheStoreModifier MODIFIER, - typename OutputIteratorT, - typename T> -__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); - - -//@} end member group - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/// Helper structure for templated store iteration (inductive case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T *ptr, T *vals) - { - ThreadStore(ptr + COUNT, vals[COUNT]); - IterateThreadStore::template Store(ptr, vals); - } - - template - static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) - { - ptr[COUNT] = vals[COUNT]; - IterateThreadStore::Dereference(ptr, vals); - } - -}; - -/// Helper structure for templated store iteration (termination case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} - - template - static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} -}; - - -/** - * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier - */ -#define 
_CUB_STORE_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y), \ - "r"(val.z), \ - "r"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val.x), \ - "l"(val.y)); \ - } - - -/** - * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val.x), \ - "h"(val.y), \ - "h"(val.z), \ - "h"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ - { \ - asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val)); \ - } - -/** - * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ - { \ - asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val)); \ - } - - -/** - * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned short* ptr, 
unsigned short val) \ - { \ - asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val)); \ - } - - -/** - * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ - { \ - asm volatile ( \ - "{" \ - " .reg .u8 datum;" \ - " cvt.u8.u16 datum, %1;" \ - " st."#ptx_modifier".u8 [%0], datum;" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"((unsigned short) val)); \ - } - -/** - * Define powers-of-two ThreadStore specializations for the given Cache load modifier - */ -#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ - _CUB_STORE_16(cub_modifier, ptx_modifier) \ - _CUB_STORE_8(cub_modifier, ptx_modifier) \ - _CUB_STORE_4(cub_modifier, ptx_modifier) \ - _CUB_STORE_2(cub_modifier, ptx_modifier) \ - _CUB_STORE_1(cub_modifier, ptx_modifier) \ - - -/** - * Define ThreadStore specializations for the various Cache load modifiers - */ -#if CUB_PTX_ARCH >= 200 - _CUB_STORE_ALL(STORE_WB, wb) - _CUB_STORE_ALL(STORE_CG, cg) - _CUB_STORE_ALL(STORE_CS, cs) - _CUB_STORE_ALL(STORE_WT, wt) -#else - _CUB_STORE_ALL(STORE_WB, global) - _CUB_STORE_ALL(STORE_CG, global) - _CUB_STORE_ALL(STORE_CS, global) - _CUB_STORE_ALL(STORE_WT, volatile.global) -#endif - - -// Macro cleanup -#undef _CUB_STORE_ALL -#undef _CUB_STORE_1 -#undef _CUB_STORE_2 -#undef _CUB_STORE_4 -#undef _CUB_STORE_8 -#undef _CUB_STORE_16 - - -/** - * ThreadStore definition for STORE_DEFAULT modifier on iterator types - */ -template -__device__ __forceinline__ void ThreadStore( - OutputIteratorT itr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - *itr = val; -} - - -/** - * ThreadStore definition for STORE_DEFAULT modifier on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - *ptr = val; -} 
- - -/** - * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatilePtr( - T *ptr, - T val, - Int2Type /*is_primitive*/) -{ - *reinterpret_cast(ptr) = val; -} - - -/** - * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatilePtr( - T *ptr, - T val, - Int2Type /*is_primitive*/) -{ - // Create a temporary using shuffle-words, then store using volatile-words - typedef typename UnitWord::VolatileWord VolatileWord; - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); - const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); - - VolatileWord words[VOLATILE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) - reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; - - IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( - reinterpret_cast(ptr), - words); -} - - -/** - * ThreadStore definition for STORE_VOLATILE modifier on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); -} - - -/** - * ThreadStore definition for generic modifiers on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - // Create a temporary using shuffle-words, then store using device-words - typedef typename UnitWord::DeviceWord DeviceWord; - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); - const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); - - DeviceWord words[DEVICE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) - reinterpret_cast(words)[i] = 
reinterpret_cast(&val)[i]; - - IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( - reinterpret_cast(ptr), - words); -} - - -/** - * ThreadStore definition for generic modifiers - */ -template -__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) -{ - ThreadStore( - itr, - val, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilIo - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh deleted file mode 100644 index 24c7a79..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_allocator.cuh +++ /dev/null @@ -1,708 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Simple caching allocator for device memory allocations. The allocator is - * thread-safe and capable of managing device allocations on multiple devices. - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" -#include "util_debug.cuh" - -#include -#include - -#include "host/mutex.cuh" -#include - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - - -/****************************************************************************** - * CachingDeviceAllocator (host use) - ******************************************************************************/ - -/** - * \brief A simple caching allocator for device memory allocations. - * - * \par Overview - * The allocator is thread-safe and stream-safe and is capable of managing cached - * device allocations on multiple devices. 
It behaves as follows: - * - * \par - * - Allocations from the allocator are associated with an \p active_stream. Once freed, - * the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for - * reuse within other streams when all prior work submitted to \p active_stream has completed. - * - Allocations are categorized and cached by bin size. A new allocation request of - * a given size will only consider cached allocations within the corresponding bin. - * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are - * deallocated instead of being returned to their bin-cache. 
- * - * \par - * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes per device - * - */ -struct CachingDeviceAllocator -{ - - //--------------------------------------------------------------------- - // Constants - //--------------------------------------------------------------------- - - /// Out-of-bounds bin - static const unsigned int INVALID_BIN = (unsigned int) -1; - - /// Invalid size - static const size_t INVALID_SIZE = (size_t) -1; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Invalid device ordinal - static const int INVALID_DEVICE_ORDINAL = -1; - - //--------------------------------------------------------------------- - // Type definitions and helper types - //--------------------------------------------------------------------- - - /** - * Descriptor for device memory allocations - */ - struct BlockDescriptor - { - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer and device) - BlockDescriptor(void *d_ptr, int device) : - d_ptr(d_ptr), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) - {} - - // Constructor (suitable for searching maps for a range of suitable blocks, given a device) - BlockDescriptor(int device) : - d_ptr(NULL), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) - {} - - // Comparison functor for comparing device pointers - 
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device == b.device) - return (a.d_ptr < b.d_ptr); - else - return (a.device < b.device); - } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device == b.device) - return (a.bytes < b.bytes); - else - return (a.device < b.device); - } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - /// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; - - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow( - unsigned int base, - unsigned int exp) - { - unsigned int retval = 1; - while (exp > 0) - { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - - /** - * Round up to the nearest power-of - */ - void NearestPowerOf( - unsigned int &power, - size_t &rounded_bytes, - unsigned int base, - size_t value) - { - power = 0; - rounded_bytes = 1; - - if (value * base < value) - { - // Overflow - power = sizeof(size_t) * 8; - rounded_bytes = size_t(0) - 1; - return; - } - - while (rounded_bytes < value) - { - rounded_bytes *= base; - power++; - } - } - - - 
//--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - cub::Mutex mutex; /// Mutex for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device - - const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout - - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. 
- */ - CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) - bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) - {} - - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes per device - */ - CachingDeviceAllocator( - bool skip_cleanup = false, - bool debug = false) - : - bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) - {} - - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. - * - * Changing the ceiling of cached bytes does not cause any allocations (in-use or - * cached-in-reserve) to be freed. See \p FreeAllCached(). 
- */ - cudaError_t SetMaxCachedBytes( - size_t max_cached_bytes) - { - // Lock - mutex.Lock(); - - if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); - - this->max_cached_bytes = max_cached_bytes; - - // Unlock - mutex.Unlock(); - - return cudaSuccess; - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - int device, ///< [in] Device on which to place the allocation - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation - { - *d_ptr = NULL; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - device = entrypoint_device; - } - - // Create a block descriptor for the requested allocation - bool found = false; - BlockDescriptor search_key(device); - search_key.associated_stream = active_stream; - NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - - if (search_key.bin > max_bin) - { - // Bin is greater than our maximum bin: allocate the request - // exactly and give out-of-bounds bin. It will not be cached - // for reuse when returned. 
- search_key.bin = INVALID_BIN; - search_key.bytes = bytes; - } - else - { - // Search for a suitable cached allocation: lock - mutex.Lock(); - - if (search_key.bin < min_bin) - { - // Bin is less than minimum bin: round up - search_key.bin = min_bin; - search_key.bytes = min_bin_bytes; - } - - // Iterate through the range of cached blocks on the same device in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - while ((block_itr != cached_blocks.end()) - && (block_itr->device == device) - && (block_itr->bin == search_key.bin)) - { - // To prevent races with reusing blocks returned by the host but still - // in use by the device, only consider cached blocks that are - // either (from the active stream) or (from an idle stream) - if ((active_stream == block_itr->associated_stream) || - (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) - { - // Reuse existing cache block. Insert into live blocks. - found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - live_blocks.insert(search_key); - - // Remove from free blocks - cached_bytes[device].free -= search_key.bytes; - cached_bytes[device].live += search_key.bytes; - - if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); - - cached_blocks.erase(block_itr); - - break; - } - block_itr++; - } - - // Done searching: unlock - mutex.Unlock(); - } - - // Allocate the block if necessary - if (!found) - { - // Set runtime's current device to specified device (entrypoint may not be set) - if (device != entrypoint_device) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; - } - - // Attempt to allocate - if (CubDebug(error = 
cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) - { - // The allocation attempt failed: free all cached blocks on device and retry - if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", - device, (long long) search_key.bytes, (long long) search_key.associated_stream); - - error = cudaSuccess; // Reset the error we will return - cudaGetLastError(); // Reset CUDART's error - - // Lock - mutex.Lock(); - - // Iterate the range of free blocks on the same device - BlockDescriptor free_key(device); - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); - - while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) - { - // No need to worry about synchronization with the device: cudaFree is - // blocking and will synchronize across all kernels executing - // on the current device - - // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; - - // Reduce balance and erase entry - cached_bytes[device].free -= block_itr->bytes; - - if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - - cached_blocks.erase(block_itr); - - block_itr++; - } - - // Unlock - mutex.Unlock(); - - // Return under error - if (error) return error; - - // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; - } - - // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - - // Insert into live blocks - mutex.Lock(); - 
live_blocks.insert(search_key); - cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); - - if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); - - // Attempt to revert back to previous device if necessary - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - } - - // Copy device pointer to output parameter - *d_ptr = search_key.d_ptr; - - if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - - return error; - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation - { - return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); - } - - - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. 
- * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceFree( - int device, - void* d_ptr) - { - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - device = entrypoint_device; - } - - // Lock - mutex.Lock(); - - // Find corresponding block descriptor - bool recached = false; - BlockDescriptor search_key(d_ptr, device); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr != live_blocks.end()) - { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - cached_bytes[device].live -= search_key.bytes; - - // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold - if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) - { - // Insert returned allocation into free blocks - recached = true; - cached_blocks.insert(search_key); - cached_bytes[device].free += search_key.bytes; - - if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. 
(%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), - (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - } - } - - // Unlock - mutex.Unlock(); - - // First set to specified device (entrypoint may not be set) - if (device != entrypoint_device) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; - } - - if (recached) - { - // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; - } - else - { - // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; - - if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - } - - // Reset device - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - } - - - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. 
- */ - cudaError_t DeviceFree( - void* d_ptr) - { - return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); - } - - - /** - * \brief Frees all cached device allocations on all devices - */ - cudaError_t FreeAllCached() - { - cudaError_t error = cudaSuccess; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); - - while (!cached_blocks.empty()) - { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) - { - if (CubDebug(error = cudaSetDevice(begin->device))) break; - current_device = begin->device; - } - - // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; - - // Reduce balance and erase entry - cached_bytes[current_device].free -= begin->bytes; - - if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); - - cached_blocks.erase(begin); - } - - mutex.Unlock(); - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - } - - - /** - * \brief Destructor - */ - virtual ~CachingDeviceAllocator() - { - if (!skip_cleanup) - FreeAllCached(); - } - -}; - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh deleted file mode 100644 index 5ec36e5..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_arch.cuh +++ /dev/null @@ -1,151 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Static architectural properties by SM version. - */ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) - #define CUB_USE_COOPERATIVE_GROUPS -#endif - -/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). -#ifndef CUB_PTX_ARCH - #ifndef __CUDA_ARCH__ - #define CUB_PTX_ARCH 0 - #else - #define CUB_PTX_ARCH __CUDA_ARCH__ - #endif -#endif - - -/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
-#ifndef CUB_RUNTIME_FUNCTION - #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) - #define CUB_RUNTIME_ENABLED - #define CUB_RUNTIME_FUNCTION __host__ __device__ - #else - #define CUB_RUNTIME_FUNCTION __host__ - #endif -#endif - - -/// Number of threads per warp -#ifndef CUB_LOG_WARP_THREADS - #define CUB_LOG_WARP_THREADS(arch) \ - (5) - #define CUB_WARP_THREADS(arch) \ - (1 << CUB_LOG_WARP_THREADS(arch)) - - #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) - #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) -#endif - - -/// Number of smem banks -#ifndef CUB_LOG_SMEM_BANKS - #define CUB_LOG_SMEM_BANKS(arch) \ - ((arch >= 200) ? \ - (5) : \ - (4)) - #define CUB_SMEM_BANKS(arch) \ - (1 << CUB_LOG_SMEM_BANKS(arch)) - - #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) - #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) -#endif - - -/// Oversubscription factor -#ifndef CUB_SUBSCRIPTION_FACTOR - #define CUB_SUBSCRIPTION_FACTOR(arch) \ - ((arch >= 300) ? \ - (5) : \ - ((arch >= 200) ? \ - (3) : \ - (10))) - #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) -#endif - - -/// Prefer padding overhead vs X-way conflicts greater than this threshold -#ifndef CUB_PREFER_CONFLICT_OVER_PADDING - #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ - ((arch >= 300) ? \ - (1) : \ - (4)) - #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) -#endif - - -/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. 
-#ifndef CUB_BLOCK_THREADS - #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ - (CUB_MIN( \ - NOMINAL_4B_BLOCK_THREADS * 2, \ - CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ - (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4, \ - (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) -#endif - -/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread -#ifndef CUB_ITEMS_PER_THREAD - #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ - (CUB_MIN( \ - NOMINAL_4B_ITEMS_PER_THREAD * 2, \ - CUB_MAX( \ - 1, \ - (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) -#endif - -/// Define both nominal threads-per-block and items-per-thread -#ifndef CUB_NOMINAL_CONFIG - #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ - CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ - CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) -#endif - - - -#endif // Do not document - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh deleted file mode 100644 index 1ad60cf..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_debug.cuh +++ /dev/null @@ -1,145 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Error and event logging routines. - * - * The following macros definitions are supported: - * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
- */ - -#pragma once - -#include -#include "util_namespace.cuh" -#include "util_arch.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - - -/// CUB error reporting macro (prints error messages to stderr) -#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) - #define CUB_STDERR -#endif - - - -/** - * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. - * - * \return The CUDA error. - */ -__host__ __device__ __forceinline__ cudaError_t Debug( - cudaError_t error, - const char* filename, - int line) -{ - (void)filename; - (void)line; -#ifdef CUB_STDERR - if (error) - { - #if (CUB_PTX_ARCH == 0) - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #elif (CUB_PTX_ARCH >= 200) - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif - } -#endif - return error; -} - - -/** - * \brief Debug macro - */ -#ifndef CubDebug - #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) -#endif - - -/** - * \brief Debug macro with exit - */ -#ifndef CubDebugExit - #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } -#endif - - -/** - * \brief Log macro for printf statements. - */ -#if !defined(_CubLog) - #if !(defined(__clang__) && defined(__CUDA__)) - #if (CUB_PTX_ARCH == 0) - #define _CubLog(format, ...) printf(format,__VA_ARGS__); - #elif (CUB_PTX_ARCH >= 200) - #define _CubLog(format, ...) 
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); - #endif - #else - // XXX shameless hack for clang around variadic printf... - // Compilies w/o supplying -std=c++11 but shows warning, - // so we sielence them :) - #pragma clang diagnostic ignored "-Wc++11-extensions" - #pragma clang diagnostic ignored "-Wunnamed-type-template-args" - template - inline __host__ __device__ void va_printf(char const* format, Args const&... args) - { - #ifdef __CUDA_ARCH__ - printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); - #else - printf(format, args...); - #endif - } - #ifndef __CUDA_ARCH__ - #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); - #else - #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); - #endif - #endif -#endif - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh deleted file mode 100644 index fa73dbd..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_device.cuh +++ /dev/null @@ -1,347 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Properties of a given CUDA device and the corresponding PTX bundle - */ - -#pragma once - -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). 
- */ -template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed -{ - const int ALIGN_BYTES = 256; - const int ALIGN_MASK = ~(ALIGN_BYTES - 1); - - // Compute exclusive prefix sum over allocation requests - size_t allocation_offsets[ALLOCATIONS]; - size_t bytes_needed = 0; - for (int i = 0; i < ALLOCATIONS; ++i) - { - size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; - allocation_offsets[i] = bytes_needed; - bytes_needed += allocation_bytes; - } - bytes_needed += ALIGN_BYTES - 1; - - // Check if the caller is simply requesting the size of the storage allocation - if (!d_temp_storage) - { - temp_storage_bytes = bytes_needed; - return cudaSuccess; - } - - // Check if enough storage provided - if (temp_storage_bytes < bytes_needed) - { - return CubDebug(cudaErrorInvalidValue); - } - - // Alias - d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); - for (int i = 0; i < ALLOCATIONS; ++i) - { - allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; - } - - return cudaSuccess; -} - - -/** - * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device - */ -template -__global__ void EmptyKernel(void) { } - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) - */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) -{ - struct Dummy - { - /// Type definition of the EmptyKernel 
kernel entry point - typedef void (*EmptyKernelPtr)(); - - /// Force EmptyKernel to be generated if this class is used - CUB_RUNTIME_FUNCTION __forceinline__ - EmptyKernelPtr Empty() - { - return EmptyKernel; - } - }; - - -#ifndef CUB_RUNTIME_ENABLED - (void)ptx_version; - - // CUDA API calls not supported from this device - return cudaErrorInvalidConfiguration; - -#elif (CUB_PTX_ARCH > 0) - - ptx_version = CUB_PTX_ARCH; - return cudaSuccess; - -#else - - cudaError_t error = cudaSuccess; - do - { - cudaFuncAttributes empty_kernel_attrs; - if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; - ptx_version = empty_kernel_attrs.ptxVersion * 10; - } - while (0); - - return error; - -#endif -} - - -/** - * \brief Retrieves the SM version (major * 100 + minor * 10) - */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) -{ -#ifndef CUB_RUNTIME_ENABLED - (void)sm_version; - (void)device_ordinal; - - // CUDA API calls not supported from this device - return cudaErrorInvalidConfiguration; - -#else - - cudaError_t error = cudaSuccess; - do - { - // Fill in SM version - int major, minor; - if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; - if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; - sm_version = major * 100 + minor * 10; - } - while (0); - - return error; - -#endif -} - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Synchronize the stream if specified - */ -CUB_RUNTIME_FUNCTION __forceinline__ -static cudaError_t SyncStream(cudaStream_t stream) -{ -#if (CUB_PTX_ARCH == 0) - return cudaStreamSynchronize(stream); -#else - (void)stream; - // Device can't yet sync on a specific stream - return cudaDeviceSynchronize(); -#endif -} - - -/** - * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p 
kernel_ptr on the current device with \p block_threads per thread block. - * - * \par Snippet - * The code snippet below illustrates the use of the MaxSmOccupancy function. - * \par - * \code - * #include // or equivalently - * - * template - * __global__ void ExampleKernel() - * { - * // Allocate shared memory for BlockScan - * __shared__ volatile T buffer[4096]; - * - * ... - * } - * - * ... - * - * // Determine SM occupancy for ExampleKernel specialized for unsigned char - * int max_sm_occupancy; - * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); - * - * // max_sm_occupancy <-- 4 on SM10 - * // max_sm_occupancy <-- 8 on SM20 - * // max_sm_occupancy <-- 12 on SM35 - * - * \endcode - * - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t MaxSmOccupancy( - int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads, ///< [in] Number of threads per thread block - int dynamic_smem_bytes = 0) -{ -#ifndef CUB_RUNTIME_ENABLED - (void)dynamic_smem_bytes; - (void)block_threads; - (void)kernel_ptr; - (void)max_sm_occupancy; - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - -#else - - return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( - &max_sm_occupancy, - kernel_ptr, - block_threads, - dynamic_smem_bytes); - -#endif // CUB_RUNTIME_ENABLED -} - - -/****************************************************************************** - * Policy management - ******************************************************************************/ - -/** - * Kernel dispatch configuration - */ -struct KernelConfig -{ - int block_threads; - int items_per_thread; - int tile_size; - int sm_occupancy; - - CUB_RUNTIME_FUNCTION __forceinline__ - KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} - - template - CUB_RUNTIME_FUNCTION __forceinline__ - 
cudaError_t Init(KernelPtrT kernel_ptr) - { - block_threads = AgentPolicyT::BLOCK_THREADS; - items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; - tile_size = block_threads * items_per_thread; - cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); - return retval; - } -}; - - - -/// Helper for dispatching into a policy chain -template -struct ChainedPolicy -{ - /// The policy for the active compiler pass - typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; - - /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version - template - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Invoke(int ptx_version, FunctorT &op) - { - if (ptx_version < PTX_VERSION) { - return PrevPolicyT::Invoke(ptx_version, op); - } - return op.template Invoke(); - } -}; - -/// Helper for dispatching into a policy chain (end-of-chain specialization) -template -struct ChainedPolicy -{ - /// The policy for the active compiler pass - typedef PolicyT ActivePolicy; - - /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version - template - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { - return op.template Invoke(); - } -}; - - - - -#endif // Do not document - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh deleted file mode 100644 index 73c29d2..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_macro.cuh +++ /dev/null @@ -1,103 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/****************************************************************************** - * Common C/C++ macro utilities - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -#ifndef CUB_ALIGN - #if defined(_WIN32) || defined(_WIN64) - /// Align struct - #define CUB_ALIGN(bytes) __declspec(align(32)) - #else - /// Align struct - #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) - #endif -#endif - -#ifndef CUB_MAX - /// Select maximum(a, b) - #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) -#endif - -#ifndef CUB_MIN - /// Select minimum(a, b) - #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) -#endif - -#ifndef CUB_QUOTIENT_FLOOR - /// Quotient of x/y rounded down to nearest integer - #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) -#endif - -#ifndef CUB_QUOTIENT_CEILING - /// Quotient of x/y rounded up to nearest integer - #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) -#endif - -#ifndef CUB_ROUND_UP_NEAREST - /// x rounded up to the nearest multiple of y - #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) -#endif - -#ifndef CUB_ROUND_DOWN_NEAREST - /// x rounded down to the nearest multiple of y - #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) -#endif - - -#ifndef CUB_STATIC_ASSERT - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - #define CUB_CAT_(a, b) a ## b - #define CUB_CAT(a, b) CUB_CAT_(a, b) - #endif // DOXYGEN_SHOULD_SKIP_THIS - - /// Static assert - #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 
1 : -1] -#endif - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh deleted file mode 100644 index edb6126..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_namespace.cuh +++ /dev/null @@ -1,46 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Place-holder for prefixing the cub namespace - */ - -#pragma once - -// For example: -//#define CUB_NS_PREFIX namespace thrust{ namespace detail { -//#define CUB_NS_POSTFIX } } - -#ifndef CUB_NS_PREFIX -#define CUB_NS_PREFIX -#endif - -#ifndef CUB_NS_POSTFIX -#define CUB_NS_POSTFIX -#endif diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh deleted file mode 100644 index fae6e4f..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_ptx.cuh +++ /dev/null @@ -1,729 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * PTX intrinsics - */ - - -#pragma once - -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" -#include "util_debug.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilPtx - * @{ - */ - - -/****************************************************************************** - * PTX helper macros - ******************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Register modifier for pointer-types (for inlining PTX assembly) - */ -#if defined(_WIN64) || defined(__LP64__) - #define __CUB_LP64__ 1 - // 64-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "l" - #define _CUB_ASM_PTR_SIZE_ "u64" -#else - #define __CUB_LP64__ 0 - // 32-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "r" - #define 
_CUB_ASM_PTR_SIZE_ "u32" -#endif - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Inlined PTX intrinsics - ******************************************************************************/ - -/** - * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. - */ -__device__ __forceinline__ unsigned int SHR_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if CUB_PTX_ARCH >= 200 - asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x >> shift) + addend; -#endif - return ret; -} - - -/** - * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. - */ -__device__ __forceinline__ unsigned int SHL_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if CUB_PTX_ARCH >= 200 - asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x << shift) + addend; -#endif - return ret; -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Bitfield-extract. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits, - Int2Type /*byte_len*/) -{ - unsigned int bits; -#if CUB_PTX_ARCH >= 200 - asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); -#else - const unsigned int MASK = (1 << num_bits) - 1; - bits = (source >> bit_start) & MASK; -#endif - return bits; -} - - -/** - * Bitfield-extract for 64-bit types. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits, - Int2Type<8> /*byte_len*/) -{ - const unsigned long long MASK = (1ull << num_bits) - 1; - return (source >> bit_start) & MASK; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Bitfield-extract. 
Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits) -{ - return BFE(source, bit_start, num_bits, Int2Type()); -} - - -/** - * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. - */ -__device__ __forceinline__ void BFI( - unsigned int &ret, - unsigned int x, - unsigned int y, - unsigned int bit_start, - unsigned int num_bits) -{ -#if CUB_PTX_ARCH >= 200 - asm ("bfi.b32 %0, %1, %2, %3, %4;" : - "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -#else - x <<= bit_start; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - ret = (y & MASK_Y) | (x & MASK_X); -#endif -} - - -/** - * \brief Three-operand add. Returns \p x + \p y + \p z. - */ -__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) -{ -#if CUB_PTX_ARCH >= 200 - asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); -#else - x = x + y + z; -#endif - return x; -} - - -/** - * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. - * - * \par - * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: - * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes - * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within - * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} - * - * \par Snippet - * The code snippet below illustrates byte-permute. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * int a = 0x03020100; - * int b = 0x07060504; - * int index = 0x00007531; - * - * int selected = PRMT(a, b, index); // 0x07050301 - * - * \endcode - * - */ -__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) -{ - int ret; - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); - return ret; -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Sync-threads barrier. - */ -__device__ __forceinline__ void BAR(int count) -{ - asm volatile("bar.sync 1, %0;" : : "r"(count)); -} - -/** - * CTA barrier - */ -__device__ __forceinline__ void CTA_SYNC() -{ - __syncthreads(); -} - - -/** - * CTA barrier with predicate - */ -__device__ __forceinline__ int CTA_SYNC_AND(int p) -{ - return __syncthreads_and(p); -} - - -/** - * Warp barrier - */ -__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - __syncwarp(member_mask); -#endif -} - - -/** - * Warp any - */ -__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __any_sync(member_mask, predicate); -#else - return ::__any(predicate); -#endif -} - - -/** - * Warp any - */ -__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __all_sync(member_mask, predicate); -#else - return ::__all(predicate); -#endif -} - - -/** - * Warp ballot - */ -__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __ballot_sync(member_mask, predicate); -#else - return __ballot(predicate); -#endif -} - -/** - * Warp synchronous shfl_up - */ -__device__ __forceinline__ -unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), 
"r"(src_offset), "r"(first_lane), "r"(member_mask)); -#else - asm volatile("shfl.up.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane)); -#endif - return word; -} - -/** - * Warp synchronous shfl_down - */ -__device__ __forceinline__ -unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask)); -#else - asm volatile("shfl.down.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane)); -#endif - return word; -} - -/** - * Warp synchronous shfl_idx - */ -__device__ __forceinline__ -unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask)); -#else - asm volatile("shfl.idx.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane)); -#endif - return word; -} - -/** - * Floating point multiply. (Mantissa LSB rounds towards zero.) - */ -__device__ __forceinline__ float FMUL_RZ(float a, float b) -{ - float d; - asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); - return d; -} - - -/** - * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
- */ -__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) -{ - float d; - asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); - return d; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Terminates the calling thread - */ -__device__ __forceinline__ void ThreadExit() { - asm volatile("exit;"); -} - - -/** - * \brief Abort execution and generate an interrupt to the host CPU - */ -__device__ __forceinline__ void ThreadTrap() { - asm volatile("trap;"); -} - - -/** - * \brief Returns the row-major linear thread identifier for a multidimensional thread block - */ -__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) -{ - return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + - ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + - threadIdx.x; -} - - -/** - * \brief Returns the warp lane ID of the calling thread - */ -__device__ __forceinline__ unsigned int LaneId() -{ - unsigned int ret; - asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); - return ret; -} - - -/** - * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. 
- */ -__device__ __forceinline__ unsigned int WarpId() -{ - unsigned int ret; - asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes less than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLt() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLe() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGt() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGe() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); - return ret; -} - -/** @} */ // end group UtilPtx - - - - -/** - * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) - * \ingroup WarpModule - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the - * predecessor of its predecessor. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... 
- * - * // Obtain item from two ranks below - * double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. - * - */ -template -__device__ __forceinline__ T ShuffleUp( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative down-offset of the peer to read from - int first_lane, ///< [in] Index of first lane in segment (typically 0) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask); - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask); - output_alias[WORD] = shuffle_word; - } - - return output; -} - - -/** - * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) - * \ingroup WarpModule - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the - * successor of its successor. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... 
- * - * // Obtain item from two ranks below - * double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. - * - */ -template -__device__ __forceinline__ T ShuffleDown( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative up-offset of the peer to read from - int last_lane, ///< [in] Index of first lane in segment (typically 31) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask); - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask); - output_alias[WORD] = shuffle_word; - } - - return output; -} - - -/** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input - * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, - * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) - * - * \ingroup WarpModule - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. - * - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... 
- * - * // Obtain item from thread 0 - * double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. - * - */ -template -__device__ __forceinline__ T ShuffleIndex( - T input, ///< [in] The value to broadcast - int src_lane, ///< [in] Which warp lane is to do the broadcasting - int logical_warp_threads, ///< [in] Number of threads per logical warp - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], - src_lane, - logical_warp_threads - 1, - member_mask); - - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], - src_lane, - logical_warp_threads - 1, - member_mask); - - output_alias[WORD] = shuffle_word; - } - - return output; -} - - - -/** - * Compute a 32b mask of threads having the same least-significant - * LABEL_BITS of \p label as the calling thread. 
- */ -template -inline __device__ unsigned int MatchAny(unsigned int label) -{ - unsigned int retval; - - // Extract masks of common threads for each bit - #pragma unroll - for (int BIT = 0; BIT < LABEL_BITS; ++BIT) - { - unsigned int mask; - unsigned int current_bit = 1 << BIT; - asm ("{\n" - " .reg .pred p;\n" - " and.b32 %0, %1, %2;" - " setp.eq.u32 p, %0, %2;\n" -#ifdef CUB_USE_COOPERATIVE_GROUPS - " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" -#else - " vote.ballot.b32 %0, p;\n" -#endif - " @!p not.b32 %0, %0;\n" - "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); - - // Remove peers who differ - retval = (BIT == 0) ? mask : retval & mask; - } - - return retval; - -// // VOLTA match -// unsigned int retval; -// asm ("{\n" -// " match.any.sync.b32 %0, %1, 0xffffffff;\n" -// "}\n" : "=r"(retval) : "r"(label)); -// return retval; - -} - - - - - - - - - - - - - - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh deleted file mode 100644 index 7de5427..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/util_type.cuh +++ /dev/null @@ -1,1141 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Common type manipulation (metaprogramming) utilities - */ - -#pragma once - -#include -#include -#include - -#include "util_macro.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - - -/****************************************************************************** - * Type equality - ******************************************************************************/ - -/** - * \brief Type selection (IF ? 
ThenType : ElseType) - */ -template -struct If -{ - /// Conditional type result - typedef ThenType Type; // true -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct If -{ - typedef ElseType Type; // false -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Conditional types - ******************************************************************************/ - -/** - * \brief Type equality test - */ -template -struct Equals -{ - enum { - VALUE = 0, - NEGATE = 1 - }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct Equals -{ - enum { - VALUE = 1, - NEGATE = 0 - }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Static math - ******************************************************************************/ - -/** - * \brief Statically determine log2(N), rounded up. - * - * For example: - * Log2<8>::VALUE // 3 - * Log2<3>::VALUE // 2 - */ -template -struct Log2 -{ - /// Static logarithm value - enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct Log2 -{ - enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case - COUNT : - COUNT - 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Statically determine if N is a power-of-two - */ -template -struct PowerOfTwo -{ - enum { VALUE = ((N & (N - 1)) == 0) }; -}; - - - -/****************************************************************************** - * Pointer vs. iterator detection - ******************************************************************************/ - -/** - * \brief Pointer vs. 
iterator - */ -template -struct IsPointer -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsPointer -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Qualifier detection - ******************************************************************************/ - -/** - * \brief Volatile modifier test - */ -template -struct IsVolatile -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsVolatile -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Qualifier removal - ******************************************************************************/ - -/** - * \brief Removes \p const and \p volatile qualifiers from type \p Tp. - * - * For example: - * typename RemoveQualifiers::Type // int; - */ -template -struct RemoveQualifiers -{ - /// Type without \p const and \p volatile qualifiers - typedef Up Type; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - - -/****************************************************************************** - * Marker types - ******************************************************************************/ - -/** - * \brief A simple "NULL" marker type - */ -struct NullType -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template - __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } - - __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } - - __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } - -#endif // DOXYGEN_SHOULD_SKIP_THIS 
-}; - - -/** - * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) - */ -template -struct Int2Type -{ - enum {VALUE = A}; -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/****************************************************************************** - * Size and alignment - ******************************************************************************/ - -/// Structure alignment -template -struct AlignBytes -{ - struct Pad - { - T val; - char byte; - }; - - enum - { - /// The "true CUDA" alignment of T in bytes - ALIGN_BYTES = sizeof(Pad) - sizeof(T) - }; - - /// The "truly aligned" type - typedef T Type; -}; - -// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree -// with device C++ compilers (EDG) on types passed as template parameters through -// kernel functions - -#define __CUB_ALIGN_BYTES(t, b) \ - template <> struct AlignBytes \ - { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; - -__CUB_ALIGN_BYTES(short4, 8) -__CUB_ALIGN_BYTES(ushort4, 8) -__CUB_ALIGN_BYTES(int2, 8) -__CUB_ALIGN_BYTES(uint2, 8) -__CUB_ALIGN_BYTES(long long, 8) -__CUB_ALIGN_BYTES(unsigned long long, 8) -__CUB_ALIGN_BYTES(float2, 8) -__CUB_ALIGN_BYTES(double, 8) -#ifdef _WIN32 - __CUB_ALIGN_BYTES(long2, 8) - __CUB_ALIGN_BYTES(ulong2, 8) -#else - __CUB_ALIGN_BYTES(long2, 16) - __CUB_ALIGN_BYTES(ulong2, 16) -#endif -__CUB_ALIGN_BYTES(int4, 16) -__CUB_ALIGN_BYTES(uint4, 16) -__CUB_ALIGN_BYTES(float4, 16) -__CUB_ALIGN_BYTES(long4, 16) -__CUB_ALIGN_BYTES(ulong4, 16) -__CUB_ALIGN_BYTES(longlong2, 16) -__CUB_ALIGN_BYTES(ulonglong2, 16) -__CUB_ALIGN_BYTES(double2, 16) -__CUB_ALIGN_BYTES(longlong4, 16) -__CUB_ALIGN_BYTES(ulonglong4, 16) -__CUB_ALIGN_BYTES(double4, 16) - -template struct AlignBytes : AlignBytes {}; -template struct AlignBytes : AlignBytes {}; -template struct AlignBytes : AlignBytes {}; - - -/// Unit-words of data movement 
-template -struct UnitWord -{ - enum { - ALIGN_BYTES = AlignBytes::ALIGN_BYTES - }; - - template - struct IsMultiple - { - enum { - UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, - IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) - }; - }; - - /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - unsigned int, - typename If::IS_MULTIPLE, - unsigned short, - unsigned char>::Type>::Type ShuffleWord; - - /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - unsigned long long, - ShuffleWord>::Type VolatileWord; - - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - ulonglong2, - VolatileWord>::Type DeviceWord; - - /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - uint4, - typename If::IS_MULTIPLE, - uint2, - ShuffleWord>::Type>::Type TextureWord; -}; - - -// float2 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint2 DeviceWord; -#else - typedef unsigned long long VolatileWord; - typedef unsigned long long DeviceWord; -#endif - typedef float2 TextureWord; -}; - -// float4 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint4 DeviceWord; -#else - typedef unsigned long long VolatileWord; - typedef ulonglong2 DeviceWord; -#endif - typedef float4 TextureWord; -}; - - -// char2 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef unsigned short ShuffleWord; -#if (CUB_PTX_ARCH > 0) && 
(CUB_PTX_ARCH <= 130) - typedef unsigned short VolatileWord; - typedef short DeviceWord; -#else - typedef unsigned short VolatileWord; - typedef unsigned short DeviceWord; -#endif - typedef unsigned short TextureWord; -}; - - -template struct UnitWord : UnitWord {}; -template struct UnitWord : UnitWord {}; -template struct UnitWord : UnitWord {}; - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Vector type inference utilities. - ******************************************************************************/ - -/** - * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. - */ -template struct CubVector; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -enum -{ - /// The maximum number of elements in CUDA vector types - MAX_VEC_ELEMENTS = 4, -}; - - -/** - * Generic vector-1 type - */ -template -struct CubVector -{ - T x; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-2 type - */ -template -struct CubVector -{ - T x; - T y; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-3 type - */ -template -struct CubVector -{ - T x; - T y; - T z; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-4 type - */ -template -struct CubVector -{ - T x; - T y; - T z; - T w; - - typedef T BaseType; - typedef CubVector Type; -}; - - -/** - * Macro for expanding partially-specialized built-in vector types - */ -#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ - \ - template<> struct CubVector : short_type##1 \ - { \ - typedef base_type BaseType; \ - typedef short_type##1 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - return 
retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##2 \ - { \ - typedef base_type BaseType; \ - typedef short_type##2 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##3 \ - { \ - typedef base_type BaseType; \ - typedef short_type##3 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - retval.z = z + other.z; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - retval.z = z - other.z; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##4 \ - { \ - typedef base_type BaseType; \ - typedef short_type##4 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - retval.z = z + other.z; \ - retval.w = w + other.w; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - retval.z = z - other.z; \ - retval.w = w - other.w; \ - return retval; \ - } \ - }; - - - -// Expand CUDA vector types for built-in primitives 
-CUB_DEFINE_VECTOR_TYPE(char, char) -CUB_DEFINE_VECTOR_TYPE(signed char, char) -CUB_DEFINE_VECTOR_TYPE(short, short) -CUB_DEFINE_VECTOR_TYPE(int, int) -CUB_DEFINE_VECTOR_TYPE(long, long) -CUB_DEFINE_VECTOR_TYPE(long long, longlong) -CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) -CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) -CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) -CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) -CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) -CUB_DEFINE_VECTOR_TYPE(float, float) -CUB_DEFINE_VECTOR_TYPE(double, double) -CUB_DEFINE_VECTOR_TYPE(bool, uchar) - -// Undefine macros -#undef CUB_DEFINE_VECTOR_TYPE - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Wrapper types - ******************************************************************************/ - -/** - * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions - */ -template -struct Uninitialized -{ - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename UnitWord::DeviceWord DeviceWord; - - enum - { - WORDS = sizeof(T) / sizeof(DeviceWord) - }; - - /// Backing storage - DeviceWord storage[WORDS]; - - /// Alias - __host__ __device__ __forceinline__ T& Alias() - { - return reinterpret_cast(*this); - } -}; - - -/** - * \brief A key identifier paired with a corresponding value - */ -template < - typename _Key, - typename _Value -#if defined(_WIN32) && !defined(_WIN64) - , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) - , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) -#endif // #if defined(_WIN32) && !defined(_WIN64) - > -struct KeyValuePair -{ - typedef _Key Key; ///< Key data type - typedef _Value Value; ///< Value data type - - Key key; ///< Item key - Value value; ///< Item value - - /// Constructor - __host__ __device__ 
__forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - -#if defined(_WIN32) && !defined(_WIN64) - -/** - * Win32 won't do 16B alignment. This can present two problems for - * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: - * 1) If a smaller-aligned item were to be listed first, the host compiler places the - * should-be-16B item at too early an offset (and disagrees with device compiler) - * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size - * of the struct wrong (and disagrees with device compiler) - * - * So we put the larger-should-be-aligned item first, and explicitly pad the - * end of the struct - */ - -/// Smaller key specialization -template -struct KeyValuePair -{ - typedef K Key; - typedef V Value; - - typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - - Value value; // Value has larger would-be alignment and goes first - Key key; - Pad pad; - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - - -/// Smaller value specialization -template -struct KeyValuePair -{ - typedef K Key; - typedef V Value; - - typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - - Key key; // Key has larger would-be alignment and goes first - Value value; - Pad pad; - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ 
__device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - -#endif // #if defined(_WIN32) && !defined(_WIN64) - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * \brief A wrapper for passing simple static arrays as kernel parameters - */ -template -struct ArrayWrapper -{ - - /// Statically-sized array of type \p T - T array[COUNT]; - - /// Constructor - __host__ __device__ __forceinline__ ArrayWrapper() {} -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. - * - * Many multi-pass computations require a pair of "ping-pong" storage - * buffers (e.g., one for reading from and the other for writing to, and then - * vice-versa for the subsequent pass). This structure wraps a set of device - * buffers and a "selector" member to track which is "current". 
- */ -template -struct DoubleBuffer -{ - /// Pair of device buffer pointers - T *d_buffers[2]; - - /// Selector into \p d_buffers (i.e., the active/valid buffer) - int selector; - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer() - { - selector = 0; - d_buffers[0] = NULL; - d_buffers[1] = NULL; - } - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer( - T *d_current, ///< The currently valid buffer - T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current - { - selector = 0; - d_buffers[0] = d_current; - d_buffers[1] = d_alternate; - } - - /// \brief Return pointer to the currently valid buffer - __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } - - /// \brief Return pointer to the currently invalid buffer - __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } - -}; - - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - - -/** - * \brief Defines a structure \p detector_name that is templated on type \p T. 
The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name - */ -#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ - template \ - struct detector_name \ - { \ - template \ - static char& test(typename C::nested_type_name*); \ - template \ - static int& test(...); \ - enum \ - { \ - VALUE = sizeof(test(0)) < sizeof(int) \ - }; \ - }; - - - -/****************************************************************************** - * Simple enable-if (similar to Boost) - ******************************************************************************/ - -/** - * \brief Simple enable-if (similar to Boost) - */ -template -struct EnableIf -{ - /// Enable-if type for SFINAE dummy variables - typedef T Type; -}; - - -template -struct EnableIf {}; - - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - -/** - * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) - */ -template -struct BinaryOpHasIdxParam -{ -private: -/* - template struct SFINAE1 {}; - template struct SFINAE2 {}; - template struct SFINAE3 {}; - template struct SFINAE4 {}; -*/ - template struct SFINAE5 {}; - template struct SFINAE6 {}; - template struct SFINAE7 {}; - template struct SFINAE8 {}; -/* - template static char Test(SFINAE1 *); - template static char Test(SFINAE2 *); - template static char Test(SFINAE3 *); - template static char Test(SFINAE4 *); -*/ - template static char Test(SFINAE5 *); - template static char Test(SFINAE6 *); - template static char Test(SFINAE7 *); - template static char Test(SFINAE8 *); - - template static int Test(...); - -public: - - /// Whether the functor BinaryOp has a third unsigned int index param - static const bool HAS_PARAM = 
sizeof(Test(NULL)) == sizeof(char); -}; - - - - -/****************************************************************************** - * Simple type traits utilities. - * - * For example: - * Traits::CATEGORY // SIGNED_INTEGER - * Traits::NULL_TYPE // true - * Traits::CATEGORY // NOT_A_NUMBER - * Traits::PRIMITIVE; // false - * - ******************************************************************************/ - -/** - * \brief Basic type traits categories - */ -enum Category -{ - NOT_A_NUMBER, - SIGNED_INTEGER, - UNSIGNED_INTEGER, - FLOATING_POINT -}; - - -/** - * \brief Basic type traits - */ -template -struct BaseTraits -{ - /// Category - static const Category CATEGORY = _CATEGORY; - enum - { - PRIMITIVE = _PRIMITIVE, - NULL_TYPE = _NULL_TYPE, - }; -}; - - -/** - * Basic type traits (unsigned primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = UNSIGNED_INTEGER; - static const UnsignedBits LOWEST_KEY = UnsignedBits(0); - static const UnsignedBits MAX_KEY = UnsignedBits(-1); - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key; - } - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key; - } - - static __host__ __device__ __forceinline__ T Max() - { - UnsignedBits retval = MAX_KEY; - return reinterpret_cast(retval); - } - - static __host__ __device__ __forceinline__ T Lowest() - { - UnsignedBits retval = LOWEST_KEY; - return reinterpret_cast(retval); - } -}; - - -/** - * Basic type traits (signed primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = SIGNED_INTEGER; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits LOWEST_KEY = HIGH_BIT; - static const UnsignedBits MAX_KEY = UnsignedBits(-1) 
^ HIGH_BIT; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - - static __host__ __device__ __forceinline__ T Max() - { - UnsignedBits retval = MAX_KEY; - return reinterpret_cast(retval); - } - - static __host__ __device__ __forceinline__ T Lowest() - { - UnsignedBits retval = LOWEST_KEY; - return reinterpret_cast(retval); - } -}; - -template -struct FpLimits; - -template <> -struct FpLimits -{ - static __host__ __device__ __forceinline__ float Max() { - return FLT_MAX; - } - - static __host__ __device__ __forceinline__ float Lowest() { - return FLT_MAX * float(-1); - } -}; - -template <> -struct FpLimits -{ - static __host__ __device__ __forceinline__ double Max() { - return DBL_MAX; - } - - static __host__ __device__ __forceinline__ double Lowest() { - return DBL_MAX * double(-1); - } -}; - - -/** - * Basic type traits (fp primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = FLOATING_POINT; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); - static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; - return key ^ mask; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? 
HIGH_BIT : UnsignedBits(-1); - return key ^ mask; - }; - - static __host__ __device__ __forceinline__ T Max() { - return FpLimits::Max(); - } - - static __host__ __device__ __forceinline__ T Lowest() { - return FpLimits::Lowest(); - } -}; - - -/** - * \brief Numeric type traits - */ -template struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; - - - -/** - * \brief Type traits - */ -template -struct Traits : NumericTraits::Type> {}; - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh deleted file mode 100644 index 682a5bf..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_shfl.cuh +++ /dev/null @@ -1,551 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. 
All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_ptx.cuh" -#include "../../util_type.cuh" -#include "../../util_macro.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. - * - * LOGICAL_WARP_THREADS must be a power-of-two - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceShfl -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp reduction steps - STEPS = Log2::VALUE, - - /// Number of logical warps in a PTX warp - LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, - }; - - template - struct IsInteger - { - enum { - ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange - IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) - }; - }; - - - // Creates a mask where the last thread in each logical warp is set - template - struct LastLaneMask - { - enum { - BASE_MASK = 1 << (LOGICAL_WARP_THREADS - 1), - MASK = (LastLaneMask::MASK << LOGICAL_WARP_THREADS) | BASE_MASK, - }; - }; - - // Creates a mask where the last thread in each logical warp is set - template - struct LastLaneMask - { - enum { - MASK = 1 << (LOGICAL_WARP_THREADS - 1), - }; - }; - - - - /// Shared memory storage layout type - typedef NullType 
TempStorage; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - - unsigned int lane_id; - - unsigned int member_mask; - - //--------------------------------------------------------------------- - // Construction - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ WarpReduceShfl( - TempStorage &/*temp_storage*/) - : - lane_id(LaneId()), - - member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? - 0 : // arch-width subwarps need not be tiled within the arch-warp - ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - - //--------------------------------------------------------------------- - // Reduction steps - //--------------------------------------------------------------------- - - /// Reduction (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int ReduceStep( - unsigned int input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned int output; - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across fp32 types) - __device__ __forceinline__ float ReduceStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - float output; - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long ReduceStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned long long output; - -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across long long types) - __device__ __forceinline__ long long ReduceStep( - long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - long long output; - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across double types) - __device__ __forceinline__ double ReduceStep( - double input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - double output; - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane)); -#endif - - return output; - } - - - /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. - SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); - - output.key = input.key; - output.value = ReduceStep( - input.value, - cub::Sum(), - last_lane, - offset, - Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key != other_key) - output.value = input.value; - - return output; - } - - - - /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. 
- SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key > 0) - output.value = input.value; - - return output; - } - - - /// Reduction step (generic) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - _T output = input; - - _T temp = ShuffleDown(output, offset, last_lane, member_mask); - - // Perform reduction op if valid - if (offset + lane_id <= last_lane) - output = reduction_op(input, temp); - - return output; - } - - - /// Reduction step (specialized for small unsigned integers size 32b or less) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer - { - return ReduceStep(input, reduction_op, last_lane, offset); - } - - - /// Reduction step (specialized for types other than small unsigned integers size 32b or less) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. 
- ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer - { - return ReduceStep(input, reduction_op, last_lane, offset); - } - - - //--------------------------------------------------------------------- - // Templated inclusive scan iteration - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ReduceStep( - T& input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - Int2Type /*step*/) - { - input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); - - ReduceStep(input, reduction_op, last_lane, Int2Type()); - } - - template - __device__ __forceinline__ void ReduceStep( - T& /*input*/, ///< [in] Calling thread's input item. 
- ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - int /*last_lane*/, ///< [in] Index of last lane in segment - Int2Type /*step*/) - {} - - - //--------------------------------------------------------------------- - // Reduction operations - //--------------------------------------------------------------------- - - /// Reduction - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - // Get the lane of the first and last thread in the logical warp - int first_thread = 0; - int last_thread = LOGICAL_WARP_THREADS - 1; - if (!IS_ARCH_WARP) - { - first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1)); - last_thread |= lane_id; - } - - // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32) - int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE; - - // Get the last valid lane - int last_lane = (ALL_LANES_VALID) ? 
- last_thread : - CUB_MIN(last_thread, first_thread + lanes_with_valid_data); - - T output = input; - -// // Iterate reduction steps -// #pragma unroll -// for (int STEP = 0; STEP < STEPS; STEP++) -// { -// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); -// } - - // Template-iterate reduction steps - ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); - - return output; - } - - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); - - // Convert to tail-segmented - if (HEAD_SEGMENTED) - warp_flags >>= 1; - - // Mask in the last lanes of each logical warp - warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK; - - // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); - - // Find the next set flag - int last_lane = __clz(__brev(warp_flags)); - - T output = input; - -// // Iterate reduction steps -// #pragma unroll -// for (int STEP = 0; STEP < STEPS; STEP++) -// { -// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); -// } - - // Template-iterate reduction steps - ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); - - return output; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh deleted file mode 100644 index 9ba8e94..0000000 --- a/untitled 
folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_reduce_smem.cuh +++ /dev/null @@ -1,375 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceSmem -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - - /// FlagT status (when not using ballot) - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - /// Shared memory flag type - typedef unsigned char SmemFlag; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - struct _TempStorage - { - T 
reduce[WARP_SMEM_ELEMENTS]; - SmemFlag flags[WARP_SMEM_ELEMENTS]; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - unsigned int lane_id; - unsigned int member_mask; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceSmem( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS), - - member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? - 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp - ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Regular reduction - //--------------------------------------------------------------------- - - /** - * Reduction step - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp, - int STEP> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*step*/) - { - const int OFFSET = 1 
<< STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Update input if peer_addend is in range - if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - - WARP_SYNC(member_mask); - - return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type()); - } - - - /** - * Reduction step (terminate) - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int /*folded_items_per_warp*/, ///< [in] Total number of valid items folded into each logical warp - ReductionOp /*reduction_op*/, ///< [in] Reduction operator - Int2Type /*step*/) - { - return input; - } - - - //--------------------------------------------------------------------- - // Segmented reduction - //--------------------------------------------------------------------- - - - /** - * Ballot-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality - { - // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. 
- warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if (!IS_ARCH_WARP) - { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; - } - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input into buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Update input if peer_addend is in range - if (OFFSET + lane_id < next_flag) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - - WARP_SYNC(member_mask); - } - - return input; - } - - - /** - * Smem-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality - { - enum - { - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - // Alias flags onto shared data storage - volatile SmemFlag *flag_storage = temp_storage.flags; - - SmemFlag flag_status = (flag) ? 
SET : UNSET; - - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - - WARP_SYNC(member_mask); - - // Share flag through buffer - flag_storage[lane_id] = flag_status; - - // Get peer flag from buffer - SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; - - // Update input if peer was in range - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - { - if (HEAD_SEGMENTED) - { - // Head-segmented - if ((flag_status & SEEN) == 0) - { - // Has not seen a more distant head flag - if (peer_flag_status & SET) - { - // Has now seen a head flag - flag_status |= SEEN; - } - else - { - // Peer is not a head flag: grab its count - input = reduction_op(input, peer_addend); - } - - // Update seen status to include that of peer - flag_status |= (peer_flag_status & SEEN); - } - } - else - { - // Tail-segmented. 
Simply propagate flag status - if (!flag_status) - { - input = reduction_op(input, peer_addend); - flag_status |= peer_flag_status; - } - - } - } - } - - return input; - } - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /** - * Reduction - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Reduction operator - { - return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type<0>()); - } - - - /** - * Segmented reduction - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator - { - return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh deleted file mode 100644 index f0deb8d..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_shfl.cuh +++ /dev/null @@ -1,656 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, 
Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_type.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - * - * LOGICAL_WARP_THREADS must be a power-of-two - */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpScanShfl -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8, - }; - - template - struct IntegerTraits - { - enum { - ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange - IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) - }; - }; - - /// Shared memory storage layout type - struct TempStorage {}; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - unsigned int lane_id; - - unsigned int member_mask; - - //--------------------------------------------------------------------- - // Construction - //--------------------------------------------------------------------- - - /// Constructor - __device__ 
__forceinline__ WarpScanShfl( - TempStorage &/*temp_storage*/) - : - lane_id(LaneId()), - - member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? - 0 : // arch-width subwarps need not be tiled within the arch-warp - ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - - //--------------------------------------------------------------------- - // Inclusive scan steps - //--------------------------------------------------------------------- - - /// Inclusive prefix scan step (specialized for summation across int32 types) - __device__ __forceinline__ int InclusiveScanStep( - int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - int output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .s32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.s32 r0, r0, %4;" - " mov.s32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.s32 r0, r0, %4;" - " mov.s32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif - - return output; - } - - /// Inclusive prefix scan step (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int InclusiveScanStep( - unsigned int input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned int output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across fp32 types) - __device__ __forceinline__ float InclusiveScanStep( - float input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - float output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long InclusiveScanStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned long long output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" - " mov.b64 r0, {lo, hi};" - " @p add.u64 r0, r0, %4;" - " mov.u64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.u64 r0, r0, %4;" - " mov.u64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across long long types) - __device__ __forceinline__ long long InclusiveScanStep( - long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - long long output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .s64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" - " mov.b64 r0, {lo, hi};" - " @p add.s64 r0, r0, %4;" - " mov.s64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.s64 r0, r0, %4;" - " mov.s64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across fp64 types) - __device__ __forceinline__ double InclusiveScanStep( - double input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - double output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); -#endif - - return output; - } - - -/* - /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePairInclusiveScanStep( - KeyValuePair input, ///< [in] Calling thread's input item. - ReduceBySegmentOp scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key > 0) - output.value = input.value; - - return output; - } -*/ - - /// Inclusive prefix scan step (generic) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. 
- ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - _T temp = ShuffleUp(input, offset, first_lane, member_mask); - - // Perform scan op if from a valid peer - _T output = scan_op(temp, input); - if (static_cast(lane_id) < first_lane + offset) - output = input; - - return output; - } - - - /// Inclusive prefix scan step (specialized for small integers size 32b or less) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer - { - return InclusiveScanStep(input, scan_op, first_lane, offset); - } - - - /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer - { - return InclusiveScanStep(input, scan_op, first_lane, offset); - } - - //--------------------------------------------------------------------- - // Templated inclusive scan iteration - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void InclusiveScanStep( - _T& input, ///< [in] Calling thread's input item. 
- ScanOp scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - Int2Type /*step*/) ///< [in] Marker type indicating scan step - { - input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); - - InclusiveScanStep(input, scan_op, first_lane, Int2Type()); - } - - template - __device__ __forceinline__ void InclusiveScanStep( - _T& /*input*/, ///< [in] Calling thread's input item. - ScanOp /*scan_op*/, ///< [in] Binary scan operator - int /*first_lane*/, ///< [in] Index of first lane in segment - Int2Type /*step*/) ///< [in] Marker type indicating scan step - {} - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Broadcast - //--------------------------------------------------------------------- - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- ScanOpT scan_op) ///< [in] Binary scan operator - { - inclusive_output = input; - - // Iterate scan steps - int segment_first_lane = 0; - - // Iterate scan steps -// InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>()); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - inclusive_output = InclusiveScanStep( - inclusive_output, - scan_op, - segment_first_lane, - (1 << STEP), - Int2Type::IS_SMALL_UNSIGNED>()); - } - - } - - /// Inclusive scan, specialized for reduce-value-by-key - template - __device__ __forceinline__ void InclusiveScan( - KeyValuePair input, ///< [in] Calling thread's input item. - KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ReduceByKeyOp scan_op) ///< [in] Binary scan operator - { - inclusive_output = input; - - KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); - - // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); - - // Find index of first set bit - int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); - - // Iterate scan steps -// InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>()); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - inclusive_output.value = InclusiveScanStep( - inclusive_output.value, - scan_op.op, - segment_first_lane, - (1 << STEP), - Int2Type::IS_SMALL_UNSIGNED>()); - } - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
- { - InclusiveScan(input, inclusive_output, scan_op); - - // Grab aggregate from last warp lane - warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); - } - - - //--------------------------------------------------------------------- - // Get exclusive from inclusive - //--------------------------------------------------------------------- - - /// Update inclusive and exclusive using input and inclusive - template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] - { - // initial value unknown - exclusive = ShuffleUp(inclusive, 1, 0, member_mask); - } - - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update( - T input, - T &inclusive, - T &exclusive, - cub::Sum /*scan_op*/, - Int2Type /*is_integer*/) - { - // initial value presumed 0 - exclusive = inclusive - input; - } - - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = ShuffleUp(inclusive, 1, 0, member_mask); - - unsigned int segment_id = (IS_ARCH_WARP) ? 
- lane_id : - lane_id % LOGICAL_WARP_THREADS; - - if (segment_id == 0) - exclusive = initial_value; - } - - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - cub::Sum scan_op, - T initial_value, - Int2Type /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = inclusive - input; - } - - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive - template - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - IsIntegerT is_integer) - { - warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); - Update(input, inclusive, exclusive, scan_op, is_integer); - } - - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - T initial_value, - IsIntegerT is_integer) - { - warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask); - Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); - } - - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh deleted file mode 100644 index c3a7a94..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/specializations/warp_scan_smem.cuh +++ /dev/null @@ -1,397 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpScanSmem -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - }; - - /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) - typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - 
******************************************************************************/ - - _TempStorage &temp_storage; - unsigned int lane_id; - unsigned int member_mask; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpScanSmem( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS), - - member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? - 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp - ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) - template < - bool HAS_IDENTITY, - int STEP, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type /*step*/) - { - const int OFFSET = 1 << STEP; - - // Share partial into buffer - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - - WARP_SYNC(member_mask); - - // Update partial if addend is in range - if (HAS_IDENTITY || (lane_id >= OFFSET)) - { - T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); - partial = scan_op(addend, partial); - } - WARP_SYNC(member_mask); - - ScanStep(partial, scan_op, Int2Type()); - } - - - /// Basic inclusive scan iteration(template unrolled, base-case specialization) - template < - bool HAS_IDENTITY, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &/*partial*/, - ScanOp /*scan_op*/, - Int2Type /*step*/) - {} - - - /// Inclusive prefix scan 
(specialized for summation across primitive types) - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type - { - T identity = 0; - ThreadStore(&temp_storage[lane_id], (CellT) identity); - - WARP_SYNC(member_mask); - - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - /// Inclusive prefix scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type - { - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Broadcast - //--------------------------------------------------------------------- - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - if (lane_id == src_lane) - { - ThreadStore(temp_storage, (CellT) input); - } - - WARP_SYNC(member_mask); - - return (T)ThreadLoad(temp_storage); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] 
Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, inclusive_output, scan_op); - - // Retrieve aggregate - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - - WARP_SYNC(member_mask); - } - - - //--------------------------------------------------------------------- - // Get exclusive from inclusive - //--------------------------------------------------------------------- - - /// Update inclusive and exclusive using input and inclusive - template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] - { - // initial value unknown - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - } - - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update( - T input, - T &inclusive, - T &exclusive, - cub::Sum /*scan_op*/, - Int2Type /*is_integer*/) - { - // initial value presumed 0 - exclusive = inclusive - input; - } - - /// Update inclusive and exclusive using initial value using input, 
inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - if (lane_id == 0) - exclusive = initial_value; - } - - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - cub::Sum scan_op, - T initial_value, - Int2Type /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = inclusive - input; - } - - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT /*scan_op*/, - IsIntegerT /*is_integer*/) - { - // Initial value presumed to be unknown or identity (either way our padding is correct) - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - T &warp_aggregate, - cub::Sum /*scan_o*/, - Int2Type /*is_integer*/) - { - // Initial value presumed to be unknown or identity (either way our padding is correct) - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) 
ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - exclusive = inclusive - input; - } - - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - // Broadcast warp aggregate - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - - WARP_SYNC(member_mask); - - // Update inclusive with initial value - inclusive = scan_op(initial_value, inclusive); - - // Get exclusive from exclusive - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); - - if (lane_id == 0) - exclusive = initial_value; - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh deleted file mode 100644 index ef78dd6..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_reduce.cuh +++ /dev/null @@ -1,612 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "specializations/warp_reduce_shfl.cuh" -#include "specializations/warp_reduce_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) - * - * \tparam T The reduction input/output element type - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) - * - Uses synchronization-free communication between warp lanes when applicable - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - * \par Simple Examples - * \warpcollective{WarpReduce} - * \par - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - * \par - * The code snippet below illustrates a single warp sum reduction within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a reduction - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sum to lane0 - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. - * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). 
- * - */ -template < - typename T, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpReduce -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - }; - -public: - - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) - typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), - WarpReduceShfl, - WarpReduceSmem >::Type InternalWarpReduce; - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - -private: - - /// Shared memory storage layout type for WarpReduce - typedef typename InternalWarpReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - -public: - - /// \smemstorage{WarpReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. 
- */ - __device__ __forceinline__ WarpReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()) - {} - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); - } - - /** - * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. 
- * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction within a single, partially-full - * block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).Sum( - * thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is - * undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( - * thread_data, head_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - * - */ - template < - typename FlagT> - __device__ __forceinline__ T HeadSegmentedSum( - T input, ///< [in] Calling thread's input - FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return HeadSegmentedReduce(input, head_flag, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... 
- * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( - * thread_data, tail_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename FlagT> - __device__ __forceinline__ T TailSegmentedSum( - T input, ///< [in] Calling thread's input - FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return TailSegmentedReduce(input, tail_flag, cub::Sum()); - } - - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - /** - * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp max reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Return the warp-wide reductions to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( - * thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, - * \p 95, and \p 127, respectively (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); - } - - /** - * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction within a single, partially-full - * block of 32 threads (one warp). 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).Reduce( - * thread_data, cub::Max(), valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is - * undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction operator - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( - * thread_data, head_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename FlagT> - __device__ __forceinline__ T HeadSegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( - * thread_data, tail_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename FlagT> - __device__ __forceinline__ T TailSegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); - } - - - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh deleted file mode 100644 index 3f78ca8..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/cub/warp/warp_scan.cuh +++ /dev/null @@ -1,936 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "specializations/warp_scan_shfl.cuh" -#include "specializations/warp_scan_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) - * - * \tparam T The scan input/output element type - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. 
- * - Supports non-commutative scan operators - * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic scan) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - * \par Simple Examples - * \warpcollective{WarpScan} - * \par - * The code snippet below illustrates four concurrent warp prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, 3, ..., 31}. - * - * \par - * The code snippet below illustrates a single warp prefix sum within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for one warp - * __shared__ typename WarpScan::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a prefix sum - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute warp-wide prefix sums - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. - * - */ -template < - typename T, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpScan -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - - /// Whether the data type is an integer (which has fully-associative addition) - IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) - }; - - /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) - typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), - WarpScanShfl, - WarpScanSmem >::Type InternalWarpScan; - - /// Shared memory storage layout type for WarpScan - typedef typename InternalWarpScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - unsigned int lane_id; - - - - /****************************************************************************** - * Public types - ******************************************************************************/ - -public: - - /// \smemstorage{WarpScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive prefix sum across the calling warp. 
- * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32}. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - InclusiveScan(input, inclusive_output, cub::Sum()); - } - - - /** - * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute inclusive warp-wide prefix sums - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. - * - * \par - * - \identityzero - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute exclusive warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31}. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - T initial_value = 0; - ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \identityzero - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix sums - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31}. 
Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T initial_value = 0; - ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. 
- * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); - } - - - /** - * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveScan( - * thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. 
- * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
- * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
- * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - initial_value, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute exclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - warp_aggregate, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
- { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - warp_aggregate, - scan_op, - initial_value, - Int2Type()); - } - - - //@} end member group - /******************************************************************//** - * \name Combination (inclusive & exclusive) prefix scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. 
- * The corresponding output \p exclusive_partial in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. 
- * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * The corresponding output \p exclusive_partial in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - initial_value, - Int2Type()); - } - - - - //@} end member group - /******************************************************************//** - * \name Data exchange - *********************************************************************/ - //@{ - - /** - * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the warp-wide broadcasts of values from - * lanes0 in each of four warps to all other threads in those warps. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Broadcast from lane0 in each warp to all other threads in the warp - * int warp_id = threadIdx.x / 32; - * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p thread_data will be - * {0, 0, ..., 0} in warp0, - * {32, 32, ..., 32} in warp1, - * {64, 64, ..., 64} in warp2, etc. - */ - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - return InternalWarpScan(temp_storage).Broadcast(input, src_lane); - } - - //@} end member group - -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py deleted file mode 100644 index fedab49..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/example.py +++ /dev/null @@ -1,31 +0,0 @@ -# Compilation cmd: python setup.py install -# Alternative JIT compilation, refer to https://pytorch.org/tutorials/advanced/cpp_extension.html#jit-compiling-extensions -# from torch.utils.cpp_extension import load -# rdxtopk = load(name="rdxtopk", sources=["rdxtopk.cpp", "rdxtopk_cuda.cu"], verbose=True) - -import torch -import rdxtopk - - -size = 500000 -k = int(size * 0.01) -t = torch.randint(100000, [size], device='cuda').float() + torch.rand(size, dtype=torch.float32, device='cuda') - -indices = torch.arange(t.numel()).cuda().int() - -print("===== original tensor ======") -print(t) - -print("\n\n===== RadixTopK values not sorted ======") -out1, out2 = rdxtopk.topk(t, indices, k) -print(out1, out2) - -print("\n\n===== RadixTopK values sorted ======") -print(out1.sort(descending=True)[0], out2[out1.sort(descending=True)[1]]) - 
-print("\n\n===== Torch TopK values sorted ======") -out3, out4 = torch.topk(t, k) -print(out3, out4) - - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp deleted file mode 100644 index a7c7097..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include - -#include - -// CUDA forward declarations - -std::vector rdxtopk_cuda(torch::Tensor input,torch::Tensor indices, unsigned int k); - - -// C++ interface -std::vector topk(torch::Tensor input,torch::Tensor indices, unsigned int k) { - return rdxtopk_cuda(input,indices, k); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("topk", &topk, "Radix TopK Selection"); -} diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu deleted file mode 100644 index 409fc4f..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/rdxtopk_cuda.cu +++ /dev/null @@ -1,582 +0,0 @@ -#include - -#include -#include -#include - -#include -#include - -#include "cub/device/device_radix_sort.cuh" -#include "cub/util_allocator.cuh" - -#include -#include -#include -#include -#include -#include -#include - -// #include "printFunctions.cuh" -// #include "generateProblems.cuh" -// #include "topk.h" - -using namespace std; -using namespace cub; -#define maxThreadsPerBlock 1024 - - -/** - * Computes the histogram over the digit values of an array of keys that MUST have a length of an integer multiple of (KPT * blockDim.x). - * The padding to the integer multiple can be done by adding 0's at the end and subtracting the number of padded 0's from the final result's 0 bin. - * The 2^NUM_BITS possible counts (0..2^NUM_BITSNUM_BITS-1) will be placed in global_histo. 
- * @param keys [IN] The keys for which to compute the histogram - * @param digit [IN] - * @param global_histo [OUT] The array of element counts, MUST be 256 in size. - * @param per_block_histo [OUT] - */ - template< - typename KeyT, // Data type of the keys within device memory. Data will be twiddled (if necessary) to unsigned type - typename IndexT, // Data type used for key's offsets and counters (limits number of supported keys, uint = 2^32) - int NUM_BITS, // Number of bits being sorted at a time - int KPT, // Number of keys per thread - int TPB, // Number of threads per block - int PRE_SORT_RUNS_LENGTH // For values greater than 1, this causes to sort a thread's keys by runs of a given length to improve run-length encoded updates to shared memory. -> -__global__ void rdxsrt_histogram(KeyT *__restrict__ keys, const uint digit, IndexT *global_histo) -{ - /*** TYPEDEFs***/ - typedef Traits KeyTraits; - typedef typename KeyTraits::UnsignedBits UnsignedBits; - /*typedef LoadUnit KeyLoader;*/ - - /*** DECLARATIONS ***/ - UnsignedBits tloc_keys[KPT]; // local keys in a thread - uint tloc_masked[KPT]; - __shared__ uint shared_bins[0x01<(keys, tloc_keys); - #pragma unroll - for (int i=0; i(keys)[block_offset + threadIdx.x + blockDim.x * i]; - } - -#if true || USE_RLE_HISTO - // Mask - #pragma unroll - for (int i=0; i>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01<>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< -__global__ void rdxsrt_histogram_with_guards(KeyT *__restrict__ keys, const uint digit, IndexT *global_histo, const IndexT total_keys, const int block_index_offset) -{ - /*** TYPEDEFs***/ - typedef Traits KeyTraits; - typedef typename KeyTraits::UnsignedBits UnsignedBits; - /*typedef LoadUnit KeyLoader;*/ - - /*** DECLARATIONS ***/ - UnsignedBits tloc_keys[KPT]; - uint tloc_masked[KPT]; - __shared__ uint shared_bins[(0x01<(keys, tloc_keys, block_max_num_keys); - #pragma unroll - for (int i=0; i(keys)[block_offset + threadIdx.x + blockDim.x * i]; - } - } - - 
#pragma unroll - for(int i=0; i>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< highest digit, 3 => lowest digit for 32-bit) -* @param digit_val [IN] Digit value. -* @param num_items [IN] Number of entries. -* @param d_keys_buffer [OUT] Entries with x[digit] = digit_val. -* @param d_keys_out [OUT] Entries with x[digit] > digit_val. -* @param d_values_buffer [OUT] Entry values with x[digit] = digit_val. -* @param d_values_out [OUT] Entry values with x[digit] > digit_val. -* @param d_index_buffer [OUT] Index into d_keys_buffer. -* @param d_index_out [OUT] Index into d_keys_out. -*/ -template< - typename KeyT, // Data type of the keys within device memory. Data will be twiddled (if necessary) to unsigned type - typename IndexT, // Data type used for key's offsets and counters (limits number of supported keys, uint = 2^32) - int NUM_BITS, // Number of bits being sorted at a time - int KPT, // Number of keys per thread - int TPB // Number of threads per block -> -__global__ void select_kth_bucket(KeyT* d_keys_in, unsigned int* d_values_in, const uint digit, const uint digit_val, uint num_items, - KeyT* d_keys_buffer, KeyT* d_keys_out, unsigned int* d_values_buffer, unsigned int* d_values_out, uint* d_index_buffer, uint* d_index_out) -{ - typedef Traits KeyTraits; - typedef typename KeyTraits::UnsignedBits UnsignedBits; - - // Specialize BlockLoad for a 1D block of TPB threads owning KPT integer items each - typedef cub::BlockLoad BlockLoadT; - - // Specialize BlockScan type for our thread block - typedef BlockScan BlockScanT; - - // in some sense, tile means block - const int tile_size = TPB * KPT; - int tile_idx = blockIdx.x; // Current tile index - int tile_offset = tile_idx * tile_size; - - // Allocate shared memory for BlockLoad - __shared__ union TempStorage - { - typename BlockLoadT::TempStorage load_items; - typename BlockScanT::TempStorage scan; - int offset[1]; - UnsignedBits raw_exchange[2 * TPB * KPT]; - } temp_storage; - - // Load a segment of consecutive 
items that are blocked across threads - UnsignedBits key_entries[KPT]; - unsigned int value_entries[KPT]; - - /*float payload_entries[KPT];*/ - int selection_flags[KPT]; - int selection_indices[KPT]; - - int num_tiles = (num_items + tile_size - 1) / tile_size; - int num_tile_items = tile_size; - bool is_last_tile = false; - if (tile_idx == num_tiles - 1) { - num_tile_items = num_items - tile_offset; - is_last_tile = true; - } - - // Load keys and values - if (is_last_tile) { - BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_keys_in) + tile_offset, key_entries, num_tile_items); - __syncthreads(); - BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_values_in) + tile_offset, value_entries, num_tile_items); - } - else { - BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_keys_in) + tile_offset, key_entries); - __syncthreads(); - BlockLoadT(temp_storage.load_items).Load(reinterpret_cast(d_values_in) + tile_offset, value_entries); - } - - __syncthreads(); - - /*** Step 1: Find keys with digit value to selected digit value ***/ - #pragma unroll - for (int ITEM = 0; ITEM < KPT; ++ITEM) - { - // Out-of-bounds items are selection_flags - selection_flags[ITEM] = 0; - - if (!is_last_tile || (int(threadIdx.x * KPT) + ITEM < num_tile_items)) { - UnsignedBits key = KeyTraits::TwiddleIn(key_entries[ITEM]); - uint masked_key = (key>>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< digit_val); - } - } - - __syncthreads(); - - // Compute exclusive prefix sum - int num_selected; - BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_selected); - - __syncthreads(); - - if (num_selected > 0) { - int index_out; - if (threadIdx.x == 0) { - // Find index into keys_out array - index_out = atomicAdd(d_index_out, num_selected); - temp_storage.offset[0] = index_out; - } - - __syncthreads(); - - index_out = temp_storage.offset[0]; - - __syncthreads(); - - // Compact and scatter items - #pragma unroll - for (int ITEM = 0; ITEM < KPT; 
++ITEM) - { - int local_scatter_offset = selection_indices[ITEM]; - if (selection_flags[ITEM]) - { - temp_storage.raw_exchange[local_scatter_offset] = key_entries[ITEM]; - temp_storage.raw_exchange[tile_size + local_scatter_offset] = value_entries[ITEM]; - /*temp_storage.raw_exchange[tile_size + local_scatter_offset] = payload_entries[ITEM];*/ - } - } - - __syncthreads(); - - // Write out matched entries to output array - for (int item = threadIdx.x; item < num_selected; item += TPB) - { - reinterpret_cast(d_keys_out)[index_out + item] = temp_storage.raw_exchange[item]; - d_values_out[index_out + item] = temp_storage.raw_exchange[tile_size + item]; - } - - __syncthreads(); - -#if 0 - for (int item = threadIdx.x; item < num_selected; item += TPB) - { - payload_out[num_selections_prefix + item] = temp_storage.raw_exchange[tile_size + item]; - } -#endif - } - - /*** Step 2: Find entries that have digit equal to digit value ***/ - #pragma unroll - for (int ITEM = 0; ITEM < KPT; ++ITEM) - { - // Out-of-bounds items are selection_flags - selection_flags[ITEM] = 0; - - if (!is_last_tile || (int(threadIdx.x * KPT) + ITEM < num_tile_items)) { - UnsignedBits key = KeyTraits::TwiddleIn(key_entries[ITEM]); - uint masked_key = (key>>((sizeof(KeyT)*8)-(NUM_BITS*(digit+1))))&((0x01< 0) { - int index_buffer; - if (threadIdx.x == 0) { - index_buffer = atomicAdd(d_index_buffer, num_selected); - temp_storage.offset[0] = index_buffer; - } - - __syncthreads(); - - index_buffer = temp_storage.offset[0]; - - __syncthreads(); - - // Compact and scatter items - #pragma unroll - for (int ITEM = 0; ITEM < KPT; ++ITEM) - { - int local_scatter_offset = selection_indices[ITEM]; - if (selection_flags[ITEM]) - { - temp_storage.raw_exchange[local_scatter_offset] = key_entries[ITEM]; - temp_storage.raw_exchange[tile_size + local_scatter_offset] = value_entries[ITEM]; - /*temp_storage.raw_exchange[tile_size + local_scatter_offset] = payload_entries[ITEM];*/ - } - } - - __syncthreads(); - - // Write 
out output entries - for (int item = threadIdx.x; item < num_selected; item += TPB) - { - reinterpret_cast(d_keys_buffer)[index_buffer + item] = temp_storage.raw_exchange[item]; - d_values_buffer[index_buffer + item] = temp_storage.raw_exchange[tile_size + item]; - } - - __syncthreads(); - } -} - -__global__ void set_index_array(unsigned int* array, unsigned int len) { - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int gridSize = blockDim.x * gridDim.x; - - while (i < len) { - array[i] = i; - i += gridSize; - } -} - - -#define KPT 16 -#define TPB 384 -#define DIGIT_BITS 8 -cudaError_t CUDARadixSelectTopK(torch::Tensor d_keys_in, - torch::Tensor d_indices_in, - unsigned int num_items, - unsigned int k, - float *d_keys_out, - unsigned int *d_values_out) { - - cudaError error = cudaSuccess; - - // get helper buffers - // unsigned int *d_histogram = buf->histogram; - // unsigned int *d_index_out = buf->index_out; - // unsigned int *d_index_buffer = buf->index_buffer; - - // float* keys_double_buffer[2] = {buf->keys_buffer0, buf->keys_buffer1}; - // unsigned int* values_double_buffer[2] = {buf->value_buffer0, buf->value_buffer1}; - unsigned char current_keys_buffer = 0; - - //initialize buffer with empty tensor - //unsigned int *d_histogram = (uint*)torch::zeros(256*128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); - //unsigned int *d_index_out = (uint*)torch::zeros(128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); - //unsigned int *d_index_buffer = (uint*)torch::zeros(128, torch::TensorOptions().dtype(torch::kInt).device(d_keys_in.device())).data_ptr(); - unsigned int *d_histogram, *d_index_out, *d_index_buffer; - cudaMalloc(&d_histogram, 256*128); - cudaMalloc(&d_index_out, 128); - cudaMalloc(&d_index_buffer, 128); - - torch::Tensor keys_double_tensor[2] = {d_keys_in.clone(), d_keys_in.clone()}; - torch::Tensor indices_double_tensor[2] = {d_indices_in.clone(), 
d_indices_in.clone()}; - float* keys_double_buffer[2] = {(float*)keys_double_tensor[0].data_ptr(), (float*)keys_double_tensor[1].data_ptr()}; - unsigned int* values_double_buffer[2] = {(unsigned int*)indices_double_tensor[0].data_ptr(), (unsigned int*)indices_double_tensor[1].data_ptr()}; - //float* keys_double_buffer[2] = {(float*)d_keys_in.clone().data_ptr(), - // (float*)d_keys_in.clone().data_ptr()}; - //unsigned int* values_double_buffer[2] = {(uint*)d_indices_in.clone().data_ptr(), - // (uint*)d_indices_in.clone().data_ptr()}; - - // Set the index into output array to 0. - cudaMemset(d_index_out, 0, 4); - - unsigned int KPB = KPT * TPB; - - unsigned int *h_histogram = new unsigned int[256]; - - // set value array (index) - // int blocksPerGrid = (int) ceil(1.0 * num_items / TPB); - // set_index_array<<>>(values_double_buffer[current_keys_buffer], num_items); - - - // enumerate each digit (32-bit data (float32) / 8-bit/pass, so that's 4 digit in total) - for (unsigned int digit = 0; digit < 4; digit++) { - unsigned int num_blocks = num_items / KPB;// Pass-0 rough processing blocks (floor on purpose) - unsigned int processed_elements = num_blocks * KPB;// Pass-0 number of rough processed elements - unsigned int remaining_elements = num_items - processed_elements;// Do the remaining elements with a check in the inner loop - unsigned int remainder_blocks = (KPB - 1 + remaining_elements) / KPB;// Number of blocks required for remaining elements (typically 0 or 1) - - /******************************************************************************************/ - /* Caluclate Histogram */ - /******************************************************************************************/ - // Zero out the histogram - cudaMemset(d_histogram, 0, 256 * sizeof(int)); - - float* d_current_keys_in = keys_double_buffer[current_keys_buffer]; - unsigned int* d_current_value_in = values_double_buffer[current_keys_buffer]; - if (num_blocks > 0) - 
rdxsrt_histogram<<>>(d_current_keys_in, digit, d_histogram); - if (remaining_elements > 0) - rdxsrt_histogram_with_guards<<>>(d_current_keys_in, digit, d_histogram, num_items, num_blocks); - - /******************************************************************************************/ - /* Find the bin which contains the Kth largest element */ - /******************************************************************************************/ - cudaMemcpy(h_histogram, d_histogram, 256 * sizeof(uint), cudaMemcpyDeviceToHost); - - // currently we find the bin on host, hence we need to synchronize the stream - // cudaStreamSynchronize(stream); - - unsigned int rolling_sum = 0; - unsigned int digit_val; - for (int i = 255; i >= 0; i--) { - if ((rolling_sum + h_histogram[i]) > k) { - digit_val = i; - k -= rolling_sum; - break; - } - rolling_sum += h_histogram[i]; - } - - cudaMemset(d_index_buffer, 0, 4); - select_kth_bucket<<>>(d_current_keys_in, - d_current_value_in, - digit, - digit_val, - num_items, - keys_double_buffer[1-current_keys_buffer], - d_keys_out, - values_double_buffer[1-current_keys_buffer], - d_values_out, - d_index_buffer, - d_index_out); - uint h_index_out; - uint h_index_buffer; - - cudaMemcpy(&h_index_out, d_index_out, sizeof(uint), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_index_buffer, d_index_buffer, sizeof(uint), cudaMemcpyDeviceToHost); - - // cudaStreamSynchronize(stream); - // Update number of items to reflect reduced number of elements. - num_items = h_index_buffer; - - if (k == 0) break; - else if (k != 0 && digit == 3) { - // We are at last digit and k != 4 implies that kth value has repetition. - // Copy any of the repeated values(and keys!) to out array to complete the array. 
- cudaMemcpy(d_keys_out + h_index_out, keys_double_buffer[1-current_keys_buffer] ,k * sizeof(float), cudaMemcpyDeviceToDevice); - cudaMemcpy(d_values_out + h_index_out, values_double_buffer[1-current_keys_buffer], k * sizeof(float), cudaMemcpyDeviceToDevice); - k -= k; - } - - current_keys_buffer = 1 - current_keys_buffer; - } - delete[] h_histogram; - cudaFree(d_histogram); - cudaFree(d_index_out); - cudaFree(d_index_buffer); -} - - -// __global__ void _Uint32ToInt32(int *dst_data, -// unsigned int *src_data, -// unsigned int n) -// { -// // set thread ID -// unsigned int tid = threadIdx.x; -// unsigned int gridSize = blockDim.x * gridDim.x; -// unsigned int i = blockIdx.x * blockDim.x + tid; -// unsigned int blockSize = blockDim.x; - -// while (i < n) { -// dst_data[i] = (int)src_data[i]; -// i += gridSize; -// } -// } - -// void Uint32ToInt32(int *dst_data, -// unsigned int *src_data, -// unsigned int num_elements) -// { -// int blocksPerGrid = (int) ceil(1.0 * num_elements / maxThreadsPerBlock); -// _Uint32ToInt32<<>>(dst_data, src_data, num_elements); -// } - - - -std::vector rdxtopk_cuda( - torch::Tensor input,torch::Tensor indices, unsigned int k) { - - unsigned int num_items = input.numel(); - - auto d_keys_out = torch::zeros(k, torch::TensorOptions().dtype(torch::kFloat32).device(input.device())); - auto d_values_out = torch::zeros(k, torch::TensorOptions().dtype(torch::kInt).device(input.device())); - - CUDARadixSelectTopK(input,indices, - num_items, - k, - (float*)d_keys_out.data_ptr(), - (uint*)d_values_out.data_ptr()); - - // Uint32ToInt32((int*)d_values_out.data_ptr(), (uint*)d_values_out.data_ptr(), k); - -return {d_keys_out, d_values_out}; -} - - diff --git a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py b/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py deleted file mode 100644 index 49de7d7..0000000 --- a/untitled folder/grace_dl/dist/compressor/radixtopk_cuda/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from 
setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='rdxtopk', - ext_modules=[ - CUDAExtension('rdxtopk', [ - 'rdxtopk.cpp', - 'rdxtopk_cuda.cu', - ]), - ], - cmdclass={ - 'build_ext': BuildExtension - }) diff --git a/untitled folder/grace_dl/dist/compressor/randomk.py b/untitled folder/grace_dl/dist/compressor/randomk.py deleted file mode 100644 index c620945..0000000 --- a/untitled folder/grace_dl/dist/compressor/randomk.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -def sparsify(tensor, compress_ratio): - tensor = tensor.flatten() - numel = tensor.numel() - k = max(1, int(numel * compress_ratio)) - # indices = torch.randperm(numel, device=tensor.device)[:k] - indices = torch.randint(numel, [k], device=tensor.device) - values = tensor[indices] - return indices, values - - -class RandomKCompressor(Compressor): - """Python libraries Based Compress by performing sparsification (i.e., sending a ratio of the actual tensor size.""" - - def __init__(self, compress_ratio): - super().__init__() - self.global_step = 0 - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - """Use Python Random libraries RNG to compress by generating a list of indices to be transmitted.""" - - h = sum(bytes(name, encoding='utf8'), self.global_step) - self.global_step += 1 - torch.manual_seed(h) - indices, values = sparsify(tensor, self.compress_ratio) - - ctx = indices, tensor.numel(), tensor.size() - return [values], ctx - - def decompress(self, tensors, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - indices, numel, shape = ctx - values, = tensors - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/signsgd.py 
b/untitled folder/grace_dl/dist/compressor/signsgd.py deleted file mode 100644 index e2e75eb..0000000 --- a/untitled folder/grace_dl/dist/compressor/signsgd.py +++ /dev/null @@ -1,30 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class SignSGDCompressor(Compressor): - - def __init__(self): - super().__init__(average=False) - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - tensor_compressed = tensor >= 0 - return [tensor_compressed.type(torch.uint8)], shape - - def decompress(self, tensors, shape): - """Decoding the signs to float format """ - sign_encode, = tensors - sign_decode = sign_encode.type(torch.float32) * 2 - 1 - tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = sum(tensors) - sign = agged_tensor >= 0 - agged_tensor = sign * 2.0 - 1.0 - return agged_tensor diff --git a/untitled folder/grace_dl/dist/compressor/signum.py b/untitled folder/grace_dl/dist/compressor/signum.py deleted file mode 100644 index 2b70bd5..0000000 --- a/untitled folder/grace_dl/dist/compressor/signum.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class SignumCompressor(Compressor): - - def __init__(self, momentum): - super().__init__(average=False) - self.momentum = momentum - self.momentums = {} - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - shape = tensor.size() - tensor = tensor.flatten() - - # update tensor by momentum - if name in self.momentums: - tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] - self.momentums[name] = tensor - tensor_compressed = tensor >= 0 - return [tensor_compressed.type(torch.uint8)], shape - - def decompress(self, tensors, shape): - sign_encode, = tensors - """Decoding the signs to float format """ - sign_decode = sign_encode.type(torch.float32) * 2 - 1 - 
tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = sum(tensors) - agged_tensor = agged_tensor >= 0 - agged_tensor = agged_tensor * 2.0 - 1.0 - return agged_tensor diff --git a/untitled folder/grace_dl/dist/compressor/terngrad.py b/untitled folder/grace_dl/dist/compressor/terngrad.py deleted file mode 100644 index c5267fc..0000000 --- a/untitled folder/grace_dl/dist/compressor/terngrad.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -from grace_dl.dist import Compressor - - -class TernGradCompressor(Compressor): - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - std = (tensor - torch.mean(tensor)) ** 2 - std = torch.sqrt(torch.mean(std)) - c = 2.5 * std.item() - gradient = torch.clamp(tensor, -c, c) - abs_gradient = gradient.abs() - scalar = abs_gradient.max() - - sign_gradient = gradient.sign() * scalar - rnd_sample = torch.empty_like(tensor).uniform_(0, 1) * scalar.item() - sign_gradient[rnd_sample >= abs_gradient] = 0 - new_sign = sign_gradient.sign() # -1, 0, 1 - - tensor_compressed = new_sign.char(), scalar.flatten() - return tensor_compressed, shape - - def decompress(self, tensor_compressed, ctx): - sign, scalar = tensor_compressed - shape = ctx - tensor_decompressed = sign.float() * scalar - return tensor_decompressed.view(shape) - - diff --git a/untitled folder/grace_dl/dist/compressor/threshold.py b/untitled folder/grace_dl/dist/compressor/threshold.py deleted file mode 100644 index b0ebac8..0000000 --- a/untitled folder/grace_dl/dist/compressor/threshold.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch - -from grace_dl.dist import Compressor - - -class ThresholdCompressor(Compressor): - - def __init__(self, threshold): - super().__init__(tensors_size_are_same=False) - self.threshold = threshold - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - indices, = 
torch.where(tensor.abs() >= min(self.threshold, torch.max(tensor))) - values = tensor[indices] - ctx = shape - return [values, indices.int()], ctx - - def decompress(self, tensor_compressed, ctx): - shape = ctx - numel = shape.numel() - values, indices = tensor_compressed - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices.long(), values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/compressor/topk.py b/untitled folder/grace_dl/dist/compressor/topk.py deleted file mode 100644 index 6fd55fe..0000000 --- a/untitled folder/grace_dl/dist/compressor/topk.py +++ /dev/null @@ -1,69 +0,0 @@ -import torch -from torch.utils.dlpack import to_dlpack - -from grace_dl.dist import Compressor - - -def rdxtopk_cuda(tensor, k): - import rdxtopk - # Only applicable to cuda tensors. - device = tensor.device - tensor = tensor.cuda() - indices = torch.arange(tensor.numel(), device=tensor.device).int() - vals, indices = rdxtopk.topk(tensor, indices, k) - return vals.to(device), indices.to(device).long() - - -def cupy_topk(tensor, k): - import cupy # conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 - # Only applicable to cuda tensors. cupy.partition is way much faster than torch.topk for large cuda tensors. - # Warning!! This is not an exact topk implementation, when there are multiple elements equal to the threshold. - # For float32 gradients, this rarely happens. 
- device = tensor.device - tensor = tensor.cuda() - cupy_tensor = cupy.fromDlpack(to_dlpack(tensor)) - threshold = -cupy.partition(-cupy_tensor, k)[k] - indices, = torch.where(tensor >= threshold.item()) - indices = indices[:k] - values = tensor[indices] - return values.to(device), indices.to(device).long() - - -def sparsify(tensor, compress_ratio, kernel): - tensor = tensor.flatten() - k = max(1, int(tensor.numel() * compress_ratio)) - if kernel == 'torch': - _, indices = torch.topk(tensor.abs(), k, sorted=False,) - elif kernel =='cupy': - _, indices = cupy_topk(tensor.abs(), k) - elif kernel == 'rdxtopk_cuda': - _, indices = rdxtopk_cuda(tensor.abs(), k) - values = torch.gather(tensor, 0, indices) - return values, indices.int() - - -def desparsify(tensors, numel): - values, indices = tensors - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices.long(), values) - return tensor_decompressed - - -class TopKCompressor(Compressor): - - def __init__(self, compress_ratio, kernel='torch'): - super().__init__() - self.compress_ratio = compress_ratio - self.kernel = kernel - - def compress(self, tensor, name): - tensors = sparsify(tensor, self.compress_ratio, self.kernel) - ctx = tensor.size() - return tensors, ctx - - def decompress(self, tensors, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - shape = ctx - numel = shape.numel() - tensor_decompressed = desparsify(tensors, numel) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/dist/helper.py b/untitled folder/grace_dl/dist/helper.py deleted file mode 100644 index e70b980..0000000 --- a/untitled folder/grace_dl/dist/helper.py +++ /dev/null @@ -1,102 +0,0 @@ -def grace_from_params(params): - comp = params.get('compressor', 'none') - mem = params.get('memory', 'none') - comm = params.get('communicator', 'allreduce') - if comp == 'dgc': - from 
grace_dl.dist.compressor.dgc import DgcCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = DgcCompressor(compress_ratio) - elif comp == 'efsignsgd': - from grace_dl.dist.compressor.efsignsgd import EFSignSGDCompressor - lr = params.get('lr', 0.1) - compressor = EFSignSGDCompressor(lr) - elif comp == 'fp16': - from grace_dl.dist.compressor.fp16 import FP16Compressor - compressor = FP16Compressor() - elif comp == 'natural': - from grace_dl.dist.compressor.natural import NaturalCompressor - compressor = NaturalCompressor() - elif comp == 'natural_cuda': - from grace_dl.dist.compressor.natural import NaturalCompressor_CUDA - compressor = NaturalCompressor_CUDA() - elif comp == 'none': - from grace_dl.dist.compressor.none import NoneCompressor - compressor = NoneCompressor() - elif comp == 'onebit': - from grace_dl.dist.compressor.onebit import OneBitCompressor - compressor = OneBitCompressor() - elif comp == 'powersgd': - from grace_dl.dist.compressor.powersgd import PowerSGDCompressor - compressor = PowerSGDCompressor() - elif comp == 'qsgd': - from grace_dl.dist.compressor.qsgd import QSGDCompressor - quantum_num = params.get('quantum_num', 127) - bucket_size = params.get('bucket_size', 128) - compressor = QSGDCompressor(quantum_num, bucket_size) - elif comp == 'qsgd_cuda': - from grace_dl.dist.compressor.qsgd import QSGDCompressor_CUDA - quantum_num = params.get('quantum_num', 127) - bucket_size = params.get('bucket_size', 128) - compressor = QSGDCompressor_CUDA(quantum_num, bucket_size) - elif comp == 'randomk': - from grace_dl.dist.compressor.randomk import RandomKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = RandomKCompressor(compress_ratio) - elif comp == 'signsgd': - from grace_dl.dist.compressor.signsgd import SignSGDCompressor - compressor = SignSGDCompressor() - elif comp == 'signum': - from grace_dl.dist.compressor.signum import SignumCompressor - momentum = params.get('momentum', 0.9) - compressor 
= SignumCompressor(momentum) - elif comp == 'terngrad': - from grace_dl.dist.compressor.terngrad import TernGradCompressor - compressor = TernGradCompressor() - elif comp == 'threshold': - from grace_dl.dist.compressor.threshold import ThresholdCompressor - threshold = params.get('threshold', 0.01) - compressor = ThresholdCompressor(threshold) - elif comp == 'topk': - from grace_dl.dist.compressor.topk import TopKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - kernel = params.get('kernel', 'torch') - compressor = TopKCompressor(compress_ratio, kernel) - else: - raise NotImplementedError(comp) - - if mem == 'dgc': - from grace_dl.dist.memory.dgc import DgcMemory - momentum = params.get('momentum', 0.9) - gradient_clipping = params.get('gradient_clipping', False) - memory = DgcMemory(momentum, gradient_clipping, params['world_size']) - elif mem == 'none': - from grace_dl.dist.memory.none import NoneMemory - memory = NoneMemory() - elif mem == 'powersgd': - from grace_dl.dist.memory.powersgd import PowerSGDMemory - compress_rank = params.get('compress_rank', 1) - memory = PowerSGDMemory(compressor.q_memory, compress_rank) - elif mem == 'residual': - from grace_dl.dist.memory.residual import ResidualMemory - memory = ResidualMemory() - elif mem == 'efsignsgd': - from grace_dl.dist.memory.efsignsgd import EFSignSGDMemory - lr = params.get('lr', 0.1) - memory = EFSignSGDMemory(lr) - else: - raise NotImplementedError(mem) - - if comm == 'allreduce': - from grace_dl.dist.communicator.allreduce import Allreduce - return Allreduce(compressor, memory, params['world_size']) - elif comm == 'allgather': - from grace_dl.dist.communicator.allgather import Allgather - return Allgather(compressor, memory, params['world_size']) - elif comm == 'broadcast': - from grace_dl.dist.communicator.broadcast import Broadcast - return Broadcast(compressor, memory, params['world_size']) - elif comm == 'alltoall': - from grace_dl.dist.communicator.all_to_all import AllToAll - 
return AllToAll(compressor, memory, params['world_size']) - else: - raise NotImplementedError(comm) diff --git a/untitled folder/grace_dl/dist/memory/dgc.py b/untitled folder/grace_dl/dist/memory/dgc.py deleted file mode 100644 index 93cd6e6..0000000 --- a/untitled folder/grace_dl/dist/memory/dgc.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -from torch import distributed as dist - -from grace_dl.dist import Memory - - -class DgcMemory(Memory): - def __init__(self, momentum, gradient_clipping, world_size): - self.gradient_clipping = gradient_clipping - self.momentum = momentum - self.world_size = world_size - self.gradients = {} - self.residuals = {} - - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - if self.gradient_clipping: - tensor_squ_sum = torch.sum(tensor * tensor) - clipping_val = torch.sqrt(dist.all_reduce(tensor_squ_sum) / self.world_size) - tensor = tensor.clamp(-clipping_val, clipping_val) - if name in self.residuals: - self.residuals[name] = self.momentum * self.residuals[name] + tensor - else: - self.residuals[name] = tensor - if name in self.gradients: - self.gradients[name] += self.residuals[name] - tensor = self.gradients[name] - else: - self.gradients[name] = tensor - return tensor - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - shape, mask, _ = ctx - not_mask = ~mask.view(shape) - temp = self.residuals[name] * not_mask - self.residuals[name] = temp - temp = self.gradients[name] * not_mask - self.gradients[name] = temp diff --git a/untitled folder/grace_dl/dist/memory/efsignsgd.py b/untitled folder/grace_dl/dist/memory/efsignsgd.py deleted file mode 100644 index c681f0e..0000000 --- a/untitled folder/grace_dl/dist/memory/efsignsgd.py +++ /dev/null @@ -1,19 +0,0 @@ -from grace_dl.torch import Memory - - -class EFSignSGDMemory(Memory): - def __init__(self, lr): - self.residuals = {} - self.learning_rate = lr - - def compensate(self, tensor, name): - 
"""Update the tensor with the residuals.""" - if name in self.residuals: - tensor = self.residuals[name] + self.learning_rate * tensor - return tensor - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - tensor_decompressed = compressor.decompress(tensor_compressed, ctx) - residual = tensor - tensor_decompressed - self.residuals[name] = residual diff --git a/untitled folder/grace_dl/dist/memory/none.py b/untitled folder/grace_dl/dist/memory/none.py deleted file mode 100644 index d603e99..0000000 --- a/untitled folder/grace_dl/dist/memory/none.py +++ /dev/null @@ -1,11 +0,0 @@ -from grace_dl.dist import Memory - - -class NoneMemory(Memory): - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - return tensor - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - pass diff --git a/untitled folder/grace_dl/dist/memory/powersgd.py b/untitled folder/grace_dl/dist/memory/powersgd.py deleted file mode 100644 index 96bde4e..0000000 --- a/untitled folder/grace_dl/dist/memory/powersgd.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch - -from grace_dl.dist import Memory - - -class PowerSGDMemory(Memory): - def __init__(self, q_memory, compress_rank=1): - self.compress_rank = compress_rank - self.q_memory = q_memory - self.residuals = {} - - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - if tensor.dim() == 1: - return tensor - - if name in self.q_memory: - tensor += self.residuals[name] - - shape = tensor.size() - n = shape[0] - m = 1 - for dim in shape[1:]: - m = m * dim - - r = min(n, m, self.compress_rank) - normal = torch.empty(m, r, dtype=tensor.dtype, layout=tensor.layout, device=tensor.device).normal_() - self.q_memory[name] = normal - - return tensor - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - if ctx is None: - return - - self.residuals[name] = 
tensor - compressor.decompress(tensor_compressed, ctx) diff --git a/untitled folder/grace_dl/dist/memory/residual.py b/untitled folder/grace_dl/dist/memory/residual.py deleted file mode 100644 index e00a161..0000000 --- a/untitled folder/grace_dl/dist/memory/residual.py +++ /dev/null @@ -1,20 +0,0 @@ -from grace_dl.dist import Memory - - -class ResidualMemory(Memory): - def __init__(self, beta=1.0, gamma=1.0): - self.residuals = {} - self.beta = beta - self.gamma = gamma - - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - if name in self.residuals: - tensor = self.beta * self.residuals[name] + self.gamma * tensor - return tensor - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - tensor_decompressed = compressor.decompress(tensor_compressed, ctx) - residual = tensor - tensor_decompressed - self.residuals[name] = residual diff --git a/untitled folder/grace_dl/tensorflow/.DS_Store b/untitled folder/grace_dl/tensorflow/.DS_Store deleted file mode 100644 index 354ac1b19a8cd736f5444b51fb428fc7b68e8ed2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM%Wl&^6ur}i#%U?aqLtbeme@ujp+Oa5W74ok7{LNiuw$#WusxRTG^(OV-qYXU z2M`;!EcgWefju9I?=FLYxb z;5jb3ynEw?4xJ9@ByT8dIvh&IG&psC z-P`~1Y_-30CI9Fozm_grQti4=%{cUFc#J-boK2~61Tnaf;BkW#uH0N+-Aad|vR^jq zSjwx~{Jc=La?F5UD*89yf|`VX!79frDY=1Hs*4r%gR`4*jTSbQSFT_vDz@_-6)V{D zO8x%2P^#o}sZgCyK^b^~(-h|^DZ=ckCiN$_P|dAl;1AJaWw!tT diff --git a/untitled folder/grace_dl/tensorflow/__init__.py b/untitled folder/grace_dl/tensorflow/__init__.py deleted file mode 100644 index 5101fb8..0000000 --- a/untitled folder/grace_dl/tensorflow/__init__.py +++ /dev/null @@ -1,114 +0,0 @@ -from abc import ABC, abstractmethod - -import tensorflow as tf - - -class Memory(ABC): - """ - Interface for implementing a memory operator - """ - @staticmethod - def init_var(initial_value): - from collections import defaultdict - residuals_init = 
defaultdict(list) - for var in tf.trainable_variables(): - shape_str = ''.join(str(e) for e in var.get_shape().as_list()) - residuals_init[shape_str].append(tf.Variable(tf.zeros_like(var), trainable=False)) - - @abstractmethod - def compensate(self, tensor, name): - """ - Compensate the tensor with the residuals. - - :param name: a unique id for this tensor - :param tensor: the current state of the tensor - :return: tensor compensated with stored values - """ - raise NotImplemented("compensate was not implemented.") - - @abstractmethod - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """ - Update the residuals. - - :param tensor: the original tensor - :param name: a unique id for this tensor - :param compressor: the compressor used to compress `tensor` - :param tensor_compressed: a collection of tensors result of `compressor.compress` - :param ctx: the context of compression - :return: a collection of memory operations to be executed - """ - raise NotImplemented("update was not implemented.") - - -class Compressor(ABC): - """ - Interface for compressing and decompressing a given tensor. - """ - - def __init__(self, average=True, tensors_size_are_same=True): - self.average = average - self.tensors_size_are_same = tensors_size_are_same - - @abstractmethod - def compress(self, tensor, name): - """ - Compresses a tensor and returns a collection of compressed tensors with the context needed to decompress it. - - :param tensor: a tensor to compress - :return: a collection of tensors and the context to decompress them - """ - raise NotImplemented("compress was not implemented.") - - @abstractmethod - def decompress(self, tensors, ctx): - """ - Decompress a collection of compressed tensors with the given context. - - :param tensors: a collection of tensors - :param ctx: context of the compression - """ - raise NotImplemented("decompress was not implemented.") - - def aggregate(self, tensors): - """ - Aggregate a collection of tensors. 
- - :param tensors: collection of tensors - :return: aggregated tensor - """ - return tf.math.add_n(tensors) - - -class Communicator(ABC): - @abstractmethod - def send_receive(self, tensors, ctx): - """ - This method should use the collective communication, aggregation, and decompression methods. - - :param tensors: list of tensors to be sent - :param ctx: compression context of the tensors - :return: a decompressed tensor, result of aggregating the tensors of each worker - """ - raise NotImplemented("send was not implemented.") - - def __init__(self, compressor, memory): - """ - :param compressor: provides compression and decompression operators - :param memory: provides compensation and update functions - """ - self.compressor = compressor - self.memory = memory - - def step(self, tensor, name): - """ - Compensate, compress, update, communicate, decompress, and aggregate tensors of all the workers - - :param tensor: the tensor to be communicated across workers - :return: the tensor after - """ - tensor = self.memory.compensate(tensor, name) - tensors_compressed, ctx = self.compressor.compress(tensor, name) - update_ops = self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - with tf.control_dependencies(update_ops): - return self.send_receive(tensors_compressed, ctx) diff --git a/untitled folder/grace_dl/tensorflow/communicator/.DS_Store b/untitled folder/grace_dl/tensorflow/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 1.25 * tf.math.ceil(elemnum * compress_ratio), lambda: 1.25 * thr, - lambda: 0.9 * thr) - 
mask = tf.math.greater_equal(tf.math.abs(tensor_masked), thr) - selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) - return thr, mask, selected - - def condition(thr, mask, selected): - cond_a = selected > 1.25 * tf.math.ceil(elemnum * compress_ratio) - cond_b = selected < 0.8 * tf.math.ceil(elemnum * compress_ratio) - return tf.math.logical_or(cond_a, cond_b) - - thr2, mask2, selected2 = tf.while_loop(condition, body, (thr, mask, selected), maximum_iterations=20) - thr2 = tf.cond(selected2 < 1, lambda: 0.8 * thr2, lambda: thr2) - mask2 = tf.math.greater(tf.math.abs(tensor_masked), thr2) - - indices = tf.reshape(tf.where(mask2), [-1]) - tensor_value2 = tf.boolean_mask(tensor_masked, mask2) - mean = tf.reshape(tf.math.reduce_mean(tensor_value2), [-1]) - - return indices, mean, mask2 - - tensor_shape = tf.shape(tensor) - tensor_size = tf.size(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - plus_mask = tf.math.greater(tensor_flatten, 0) - minus_mask = tf.math.less(tensor_flatten, 0) - plus_indices, plus_mean, plus_mask = quan(tensor_flatten, plus_mask, self.compress_ratio) - minus_indices, minus_mean, minus_mask = quan(tensor_flatten, minus_mask, self.compress_ratio) - - plus_mean = tf.bitcast(plus_mean, tf.int32) - plus_indices = tf.reshape(tf.cast(plus_indices, dtype=tf.int32), [-1]) - minus_mean = tf.bitcast(minus_mean, tf.int32) - minus_indices = tf.reshape(tf.cast(minus_indices, dtype=tf.int32), [-1]) - plus_indices_size = tf.reshape(tf.size(plus_indices), [-1]) - tensor_compressed = tf.concat([plus_mean, minus_mean, plus_indices_size, plus_indices, minus_indices], 0) - ctx = tensor_shape, tensor_size - return [tensor_compressed], ctx - - def decompress(self, tensors_compressed, ctx): - tensor_compressed, = tensors_compressed - plus_mean = tensor_compressed[0] - minus_mean = tensor_compressed[1] - plus_indices_size = tensor_compressed[2] - plus_indices = tensor_compressed[3:3 + plus_indices_size] - minus_indices = tensor_compressed[3 + 
plus_indices_size:] - - plus_mean = tf.bitcast(plus_mean, tf.float32) - minus_mean = tf.bitcast(minus_mean, tf.float32) - tensor_shape, tensor_size = ctx - - plus_mean = tf.ones(tf.shape(plus_indices), dtype=tf.float32) * plus_mean - minus_mean = tf.ones(tf.shape(minus_indices), dtype=tf.float32) * minus_mean - indices = tf.expand_dims(plus_indices, 1) - plus_means = tf.scatter_nd(indices, plus_mean, [tensor_size]) - indices = tf.expand_dims(minus_indices, 1) - tensor_decompressed = tf.tensor_scatter_nd_update(plus_means, indices, minus_mean) - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/dgc.py b/untitled folder/grace_dl/tensorflow/compressor/dgc.py deleted file mode 100644 index af81765..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/dgc.py +++ /dev/null @@ -1,68 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class DgcCompressor(Compressor): - """ - (2017). Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training. 
- Retrieved from http://arxiv.org/abs/1712.01887 - """ - def __init__(self, compress_ratio): - super().__init__(tensors_size_are_same=False) - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - elemnum = tensor_flatten.get_shape().as_list()[0] - - sample_shape = tf.reshape(tf.convert_to_tensor(max(1, int(elemnum * 0.01)), dtype=tf.int32), [-1]) - sample_index = tf.random.uniform(sample_shape, minval=0, maxval=elemnum, dtype=tf.int32) - sample_tensor = tf.gather(tensor_flatten, sample_index) - - k = max(1, int(elemnum * self.compress_ratio * 0.01)) - vals, indices = tf.math.top_k(tf.math.abs(sample_tensor), k) - thr = tf.math.reduce_min(vals) - mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) - - selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) - - def body(thr, mask, selected): - thr = tf.cond(selected > 1.25 * max(1, int(elemnum * self.compress_ratio)), lambda: 1.25 * thr, - lambda: 0.9 * thr) - mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) - selected = tf.math.reduce_sum(tf.cast(mask, dtype=tf.float32)) - return thr, mask, selected - - def condition(thr, mask, selected): - cond_a = selected > 1.25 * max(1, int(elemnum * self.compress_ratio)) - cond_b = selected < 0.8 * max(1, int(elemnum * self.compress_ratio)) - return tf.math.logical_or(cond_a, cond_b) - - thr, mask, new_selected = tf.while_loop(condition, body, (thr, mask, selected), maximum_iterations=20) - - thr = tf.cond(new_selected < 1, lambda: 0.8 * thr, lambda: thr) - # mask = tf.math.greater_equal(tf.math.abs(tensor_flatten), thr) - mask = tf.math.greater(tf.math.abs(tensor_flatten), thr) # fix the dgc NCF data volume issue - - indices = tf.reshape(tf.where(mask), [-1]) - values = tf.gather(tensor_flatten, indices) - - values = tf.bitcast(values, tf.int32) - indices = tf.cast(indices, dtype=tf.int32) - tensor_compressed = tf.concat([values, indices], 0) - ctx = 
tensor_shape, mask - - return [tensor_compressed], ctx - - def decompress(self, tensors_compressed, ctx): - tensor_compressed, = tensors_compressed - tensor_shape, _ = ctx - values, indices = tf.split(tensor_compressed, 2) - values = tf.bitcast(values, tf.float32) - tensor_size = tf.math.reduce_prod(tensor_shape) - indices = tf.expand_dims(indices, 1) - tensor_decompressed = tf.scatter_nd(indices, values, [tensor_size]) - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py b/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py deleted file mode 100644 index f0b0ec2..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/efsignsgd.py +++ /dev/null @@ -1,40 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class EFSignSGDCompressor(Compressor): - """ - (2019). Error Feedback Fixes SignSGD and other Gradient Compression Schemes. 
- Retrieved from http://arxiv.org/abs/1901.09847 - """ - def __init__(self, lr): - super().__init__() - self.learning_rate = lr - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = tf.math.add_n(tensors) - agged_tensor = agged_tensor / self.learning_rate - return agged_tensor - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - sign_encode = tf.math.greater_equal(tensor_flatten, 0) - mean = tf.math.reduce_mean(tf.abs(tensor_flatten)) - ctx = tensor_shape - tensor_compressed = mean, sign_encode - - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - """Decoding the signs to float format """ - mean, sign_encode = tensor_compressed - tensor_shape = ctx - sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 - sign_decode = mean * sign_decode - tensor_decompressed = tf.reshape(sign_decode, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/fp16.py b/untitled folder/grace_dl/tensorflow/compressor/fp16.py deleted file mode 100644 index be246a4..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/fp16.py +++ /dev/null @@ -1,25 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class FP16Compressor(Compressor): - """Compress all floating point gradients to 16-bit.""" - - def compress(self, tensor, name): - """Downcasts the tensor to 16-bit.""" - tensor_compressed = tensor - - if tensor.dtype.is_floating: - # Only allow compression from other floating point types - tensor_compressed = tf.cast(tensor, dtype=tf.float16) - return [tensor_compressed], tensor.dtype - - def decompress(self, tensor, ctx): - """Upcasts the tensor to the initialization dtype.""" - - tensor_decompressed, = tensor - dtype = ctx - if dtype.is_floating: - tensor_decompressed = tf.cast(tensor, dtype=dtype) - return 
tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py b/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py deleted file mode 100644 index b973598..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/inceptionn.py +++ /dev/null @@ -1,188 +0,0 @@ -import math - -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class INCEPTIONNCompressor(Compressor): - """ - (2018). A network-centric hardware/algorithm co-design to accelerate distributed training of deep neural networks. - https://doi.org/10.1109/MICRO.2018.00023 - """ - def __init__(self, error_bound): - super().__init__(tensors_size_are_same=False) - self.error_bound = error_bound - - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - tensor_cast = tf.bitcast(tensor_flatten, tf.uint32) - sign = tf.bitwise.bitwise_and(tensor_cast, 0b10000000000000000000000000000000) - exp = tf.bitwise.bitwise_and(tensor_cast, 0b01111111100000000000000000000000) - mantissa = tf.bitwise.bitwise_and(tensor_cast, 0b00000000011111111111111111111111) - - exp = tf.bitwise.right_shift(exp, 23) - - error_bound_val = self.error_bound - error_bound = 127 + int(math.log(error_bound_val / 2, 10)) # error_bound exponent: 117 for 2e-10 - radius = math.ceil((127 - error_bound) / 2) - mid = error_bound + radius - mask_32bit = exp >= 127 - mask_16bit = (exp >= mid) & (exp < 127) - mask_8bit = (exp >= error_bound) & (exp < mid) - indices_32bit = tf.reshape(tf.where(mask_32bit), [-1]) - indices_16bit = tf.reshape(tf.where(mask_16bit), [-1]) - indices_8bit = tf.reshape(tf.where(mask_8bit), [-1]) - - # no compress - v_32bit = tf.gather(tensor_flatten, indices_32bit) - - # 16bit compress - s_16bit = tf.gather(sign, indices_16bit) - e_16bit = tf.gather(exp, indices_16bit) - m_16bit = tf.gather(mantissa, indices_16bit) - n_shift = 127 - tf.cast(e_16bit, dtype=tf.int32) - n_shift = tf.cast(n_shift, tf.uint32) - 
shifted_s = tf.bitwise.right_shift(s_16bit, 8) - marker = 0b00000000010000000000000000000000 - m_16bit_concat = tf.bitwise.bitwise_or(tf.bitwise.right_shift(m_16bit, 1), marker) - shifted_m = tf.bitwise.right_shift(m_16bit_concat, n_shift) - temp = tf.bitwise.bitwise_or(shifted_s, shifted_m) - v_16bit = tf.cast(tf.bitwise.right_shift(temp, 8), dtype=tf.uint16) - - # 8bit compress - s_8bit = tf.gather(sign, indices_8bit) - e_8bit = tf.gather(exp, indices_8bit) - m_8bit = tf.gather(mantissa, indices_8bit) - n_shift = 127 - tf.cast(e_8bit, dtype=tf.int32) - n_shift = tf.cast(n_shift, tf.uint32) - shifted_s = tf.bitwise.right_shift(s_8bit, 8) - marker = 0b00000000010000000000000000000000 - m_8bit_concat = tf.bitwise.bitwise_or(tf.bitwise.right_shift(m_8bit, 1), marker) - shifted_m = tf.bitwise.right_shift(m_8bit_concat, n_shift) - temp = tf.bitwise.bitwise_or(shifted_s, shifted_m) - v_8bit = tf.cast(tf.bitwise.right_shift(temp, 16), dtype=tf.uint8) - - def encode_byte(a): - # input: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) - # output: encoded uint8 type tensor - a = tf.reshape(a, [-1]) - pad_size = 4 - tf.mod(tf.size(a), 4) - pad = tf.range(0, pad_size) - a = tf.concat([a, pad], 0) - a_split1, a_split2, a_split3, a_split4 = tf.split(a, 4) - - # encode 4 grads into 1 Byte - sum_1 = tf.add(a_split1, a_split2 * 4) - sum_2 = tf.add(a_split3 * 16, a_split4 * 64) - sum_all = tf.add(sum_1, sum_2) - return tf.cast(sum_all, tf.uint8) - - # encode indices - mask_encode = 0 - for mask, code in zip([mask_8bit, mask_16bit, mask_32bit], [1, 2, 3]): - mask_encode += tf.cast(mask, tf.int32) * code - mask_encode = encode_byte(mask_encode) - v_32bit = tf.reshape(v_32bit, [-1]) - v_16bit = tf.reshape(v_16bit, [-1]) - v_8bit = tf.reshape(v_8bit, [-1]) - mask_encode = tf.reshape(mask_encode, [-1]) - tensor_compressed = v_32bit, v_16bit, v_8bit, mask_encode - ctx = tensor_shape - - return tensor_compressed, ctx - - # decompress - def decompress(self, 
tensor_compressed, ctx): - - def decode_byte(encoded, real_size): - # input: encoded uint8 type tensor - # output: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) - a = tf.cast(encoded, tf.int32) - a_split1 = tf.mod(a, 4) - a_split2 = tf.cast(tf.mod(a / 4, 4), tf.int32) - a_split3 = tf.cast(tf.mod(a / 16, 4), tf.int32) - a_split4 = tf.cast(tf.mod(a / 64, 4), tf.int32) - a = tf.concat([a_split1, a_split2, a_split3, a_split4], 0) - a = a[:real_size] - return a - - v_32bit, v_16bit, v_8bit, mask_encode = tensor_compressed - tensor_shape = ctx - tensor_size = tf.math.reduce_prod(tensor_shape) - - # decode mask and gather indices - mask_decode = decode_byte(mask_encode, tensor_size) - mask_32bit = tf.equal(mask_decode, 3) - mask_16bit = tf.equal(mask_decode, 2) - mask_8bit = tf.equal(mask_decode, 1) - indices_32bit = tf.reshape(tf.where(mask_32bit), [-1]) - indices_16bit = tf.reshape(tf.where(mask_16bit), [-1]) - indices_8bit = tf.reshape(tf.where(mask_8bit), [-1]) - - edges_16bit = [0, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536] - edges_8bit = [0, 2, 4, 8, 16, 32, 64, 128, 256] - import tensorflow_probability as tfp - - # 16bit decompress - # get the sign bit s_16bit and remove MSB from v_16bit - s_16bit = tf.bitwise.bitwise_and(v_16bit, 0b1000000000000000) - s_16bit = tf.cast(s_16bit, dtype=tf.int32) - s_16bit = tf.bitwise.left_shift(s_16bit, 16) - v_16bit = tf.bitwise.left_shift(v_16bit, 1) - - # 8bit decompress - # get the sign bit s_8bit and remove MSB from v_8bit - s_8bit = tf.bitwise.bitwise_and(v_8bit, 0b10000000) - s_8bit = tf.cast(s_8bit, dtype=tf.int32) - s_8bit = tf.bitwise.left_shift(s_8bit, 24) - v_8bit = tf.bitwise.left_shift(v_8bit, 1) - - # find the marker bit in v_16bit and get the exponent - zero_tensor = tf.Variable(tf.zeros([tensor_size], dtype=tf.int32), trainable=False) - op = zero_tensor.assign(tf.zeros([tensor_size], dtype=tf.int32)) - with tf.control_dependencies([op]): - temp = 
tf.scatter_update(zero_tensor, indices_16bit, tf.cast(v_16bit, tf.int32)) - temp = tf.scatter_update(temp, indices_8bit, tf.cast(v_8bit, tf.int32)) - n_shift_all = tfp.stats.find_bins(tf.cast(temp, dtype=tf.int32), edges_16bit) - - n_shift = 16 - tf.gather(n_shift_all, indices_16bit) - e_16bit = 127 - (n_shift - 1) - e_16bit = tf.bitwise.left_shift(e_16bit, 23) - - # restore the mantissa - n_shift = tf.cast(n_shift, dtype=tf.uint16) - v_16bit = tf.bitwise.left_shift(v_16bit, n_shift) - v_16bit = tf.cast(v_16bit, dtype=tf.int32) - m_16bit = tf.bitwise.left_shift(v_16bit, 7) - - # concat all - temp = tf.bitwise.bitwise_or(s_16bit, e_16bit) - v_16bit = tf.bitwise.bitwise_or(temp, m_16bit) - v_16bit = tf.bitcast(v_16bit, tf.float32) - - # find the marker bit in v_8bit and get the exponent - n_shift = 8 - tf.gather(n_shift_all, indices_8bit) - e_8bit = 127 - (n_shift - 1) - e_8bit = tf.bitwise.left_shift(e_8bit, 23) - - # restore the mantissa - n_shift = tf.cast(n_shift, dtype=tf.uint8) - v_8bit = tf.bitwise.left_shift(v_8bit, n_shift) - v_8bit = tf.cast(v_8bit, dtype=tf.int32) - m_8bit = tf.bitwise.left_shift(v_8bit, 15) - - # concat all - temp = tf.bitwise.bitwise_or(s_8bit, e_8bit) - v_8bit = tf.bitwise.bitwise_or(temp, m_8bit) - v_8bit = tf.bitcast(v_8bit, tf.float32) - - zero_tensor = tf.Variable(tf.zeros([tensor_size], dtype=tf.float32), trainable=False) - op = zero_tensor.assign(tf.zeros([tensor_size], dtype=tf.float32)) - with tf.control_dependencies([op]): - temp = tf.scatter_update(zero_tensor, indices_32bit, v_32bit) - temp = tf.scatter_update(temp, indices_16bit, v_16bit) - temp = tf.scatter_update(temp, indices_8bit, v_8bit) - tensor_decompressed = tf.reshape(temp, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/natural.py b/untitled folder/grace_dl/tensorflow/compressor/natural.py deleted file mode 100644 index 262be68..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/natural.py +++ 
/dev/null @@ -1,42 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class NaturalCompressor(Compressor): - """ - (2019). Natural compression for distributed deep learning. - Retrieved from https://arxiv.org/abs/1905.10988 - """ - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - - tensor_cast = tf.bitcast(tensor_flatten, tf.int32) - sign = tf.bitwise.bitwise_and(tensor_cast, 0b10000000000000000000000000000000) - exp = tf.bitwise.bitwise_and(tensor_cast, 0b01111111100000000000000000000000) - mantissa = tf.bitwise.bitwise_and(tensor_cast, 0b00000000011111111111111111111111) - exp_add_one = mantissa > tf.random.uniform(tf.shape(tensor_flatten), minval=0, maxval=0x007fffff, - dtype=tf.int32) - # exp_add_one = mantissa > 0x00400000 # deterministic - exponent = tf.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) - # original exponent range: -128 ~ 127, clip to -110, 17 - # view as uint8_t: 0 ~ 255 18 145 - exp_shift = tf.clip_by_value(exponent, 0b00001001000000000000000000000000, 0b01001000100000000000000000000000) - exps = tf.bitwise.right_shift(exp_shift, 23) - # shift 18 so that 0 maps to -110 and 127 maps to 145 - # set MSB if negative - exps = tf.bitwise.bitwise_or(tf.bitwise.right_shift(sign, 24), exps - 18) - tensor_compressed = tf.cast(exps, tf.uint8) - - return [tensor_compressed], tensor_shape - - def decompress(self, tensors_compressed, tensor_shape): - tensor_compressed, = tensors_compressed - sign = tensor_compressed > 127 - exps = tf.bitwise.bitwise_and(tensor_compressed, 0b01111111) - exps_shift = tf.bitwise.left_shift(tf.cast(exps + 18, tf.int32), 23) - floats = tf.bitcast(exps_shift, tf.float32) - tensor_decompressed = tf.where(sign, -floats, floats) - tensor_decompressed = tf.multiply(tf.cast(exps >= 1, tensor_decompressed.dtype), tensor_decompressed) - return tf.reshape(tensor_decompressed, tensor_shape) diff --git a/untitled 
folder/grace_dl/tensorflow/compressor/none.py b/untitled folder/grace_dl/tensorflow/compressor/none.py deleted file mode 100644 index 0fa908c..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/none.py +++ /dev/null @@ -1,14 +0,0 @@ -from grace_dl.tensorflow import Compressor - - -class NoneCompressor(Compressor): - """Default no-op compression.""" - - def compress(self, tensor, name): - """Returns the tensor unmodified.""" - return [tensor], None - - def decompress(self, tensors, ctx): - """Returns the tensor unmodified.""" - tensor, = tensors - return tensor diff --git a/untitled folder/grace_dl/tensorflow/compressor/onebit.py b/untitled folder/grace_dl/tensorflow/compressor/onebit.py deleted file mode 100644 index f97733e..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/onebit.py +++ /dev/null @@ -1,42 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class OneBitCompressor(Compressor): - """ - (2014). 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. 
- Retrieved from https://www.microsoft.com/en-us/research/publication/1-bit-stochastic-gradient-descent-and-application-to-data-parallel-distributed-training-of-speech-dnns/ - """ - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - - mask0 = tf.math.less(tensor_flatten, 0) - sum0 = tf.math.reduce_sum(tf.boolean_mask(tensor_flatten, mask0)) - num0 = tf.math.reduce_sum(tf.cast(mask0, dtype=tf.float32)) - num0 = tf.where(tf.math.greater(num0, 0), num0, 1.0) - mean0 = sum0 / num0 - - mask1 = tf.math.logical_not(mask0) - sum1 = tf.math.reduce_sum(tf.boolean_mask(tensor_flatten, mask1)) - num1 = tf.math.reduce_sum(tf.cast(mask1, dtype=tf.float32)) - num1 = tf.where(tf.math.greater(num1, 0), num1, 1.0) - mean1 = sum1 / num1 - - mean0 = tf.reshape(mean0, [-1]) - mean1 = tf.reshape(mean1, [-1]) - mean = tf.concat([mean0, mean1], 0) - ctx = tensor_shape - tensor_compressed = mask0, mean - - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - tensor_shape = ctx - mask0, mean = tensor_compressed - mean0, mean1 = tf.split(mean, 2) - mask0 = tf.cast(mask0, dtype=tf.float32) - tensor_decompressed = mask0 * mean0 + (1 - mask0) * mean1 - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/packing.py b/untitled folder/grace_dl/tensorflow/compressor/packing.py deleted file mode 100644 index 91ecd34..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/packing.py +++ /dev/null @@ -1,30 +0,0 @@ -import tensorflow as tf - - -def encode_byte(a): - # input: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) - # output: encoded uint8 type tensor - a = tf.reshape(a, [-1]) - pad_size = 4 - tf.mod(tf.size(a), 4) - pad = tf.range(0, pad_size) - a = tf.concat([a, pad], 0) - a_split1, a_split2, a_split3, a_split4 = tf.split(a, 4) - - # encode 4 grads into 1 Byte - sum_1 
= tf.add(a_split1, a_split2 * 4) - sum_2 = tf.add(a_split3 * 16, a_split4 * 64) - sum_all = tf.add(sum_1, sum_2) - return tf.cast(sum_all, tf.uint8) - - -def decode_byte(encoded, real_size): - # input: encoded uint8 type tensor - # output: int32 type tensor with values in range 0,1,2,3 (2'b00,2'b01,2'b10,3'b11) - a = tf.cast(encoded, tf.int32) - a_split1 = tf.mod(a, 4) - a_split2 = tf.cast(tf.mod(a / 4, 4), tf.int32) - a_split3 = tf.cast(tf.mod(a / 16, 4), tf.int32) - a_split4 = tf.cast(tf.mod(a / 64, 4), tf.int32) - a = tf.concat([a_split1, a_split2, a_split3, a_split4], 0) - a = a[:real_size] - return a \ No newline at end of file diff --git a/untitled folder/grace_dl/tensorflow/compressor/powersgd.py b/untitled folder/grace_dl/tensorflow/compressor/powersgd.py deleted file mode 100644 index 707b2a2..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/powersgd.py +++ /dev/null @@ -1,58 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor -from horovod.tensorflow import _allreduce - - -class PowerSGDCompressor(Compressor): - """ - PowerSGD: Practical Low-Rank Gradient Compression for Distributed Optimization. - T. Vogels, S. P. Karimireddy, and M. Jaggi. In NeurIPS, 2019. 
- """ - - def __init__(self, momentum_factor, world_size): - super().__init__() - self.q_memory = {} - self.world_size = world_size - self.momentum = {} - self.momentum_factor = momentum_factor - for v in tf.trainable_variables(): - self.momentum[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) - - def compress(self, tensor, name): - tensor_dims = len(tensor.get_shape().as_list()) - if tensor_dims == 1: - return [tensor], None - - tensor_shape = tf.shape(tensor) - matrix = tf.reshape(tensor, [tensor_shape[0], -1]) - - q = self.q_memory[name] - q, _ = tf.linalg.qr(q) - - p = tf.linalg.matmul(matrix, q) - p = _allreduce(p) / self.world_size - p, _ = tf.linalg.qr(p) - q = tf.linalg.matmul(matrix, p, transpose_a=True) - q = _allreduce(q) / self.world_size - new_q = self.q_memory[name].assign(q) - ctx = p, new_q, tensor_shape, name - # variable initialization needs to be called before communication starts - # self.momentum[tensor.name] = tf.Variable(tf.zeros_like(tensor), trainable=False) - - return [], ctx - - def decompress(self, tensors, ctx): - if ctx is None: - tensor, = tensors - return tensor - - p, q, tensor_shape, tensor_name = ctx - new_tensor = tf.linalg.matmul(p, q, transpose_b=True) - new_tensor = tf.reshape(new_tensor, tensor_shape) - - new_momentum = self.momentum[tensor_name].assign(self.momentum_factor * self.momentum[tensor_name] + - (1 - self.momentum_factor) * new_tensor) - tensor_decompressed = new_momentum + new_tensor - - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/qsgd.py b/untitled folder/grace_dl/tensorflow/compressor/qsgd.py deleted file mode 100644 index 608153e..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/qsgd.py +++ /dev/null @@ -1,44 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class QSGDCompressor(Compressor): - """ - (2016). QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding. 
- Retrieved from http://arxiv.org/abs/1610.02132 - """ - def __init__(self, quantum_num): - super().__init__() - self.quantum_num = quantum_num - - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - norm = tf.reshape(tf.norm(tensor_flatten), [-1]) - abs_gradient = tf.abs(tensor_flatten) - qnum = tf.cast(self.quantum_num, dtype=tf.float32) - - level_float = qnum / norm * abs_gradient - previous_level = tf.math.floor(level_float) - prob = tf.random.uniform(tf.shape(tensor_flatten)) - is_next_level = tf.cast(tf.math.less(prob, (level_float - previous_level)), tf.float32) - new_level = tf.cast(previous_level + is_next_level, tf.float32) - - sign = tf.sign(tensor_flatten) - tensor_compressed = new_level * sign - tensor_compressed = tf.cast(tensor_compressed, dtype=tf.int8 if self.quantum_num < 128 else tf.int16) - tensor_compressed = tensor_compressed, norm - ctx = tensor_shape - - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - qnum = tf.cast(self.quantum_num, dtype=tf.float32) - tensor_shape = ctx - tensor_compressed, norm = tensor_compressed - - decode_output = tf.cast(tensor_compressed, dtype=tf.float32) - tensor_decompressed = norm / qnum * decode_output - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/randomk.py b/untitled folder/grace_dl/tensorflow/compressor/randomk.py deleted file mode 100644 index 983c3cf..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/randomk.py +++ /dev/null @@ -1,53 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -def sparsify(tensor, compress_ratio): - elemnum = tensor.get_shape().as_list()[0] - k = max(1, int(elemnum * compress_ratio)) - _, indices = tf.math.top_k(tf.math.abs(tensor), k) - values = tf.gather(tensor, indices) - return indices, values - - -def desparsify(indices, values, 
tensor_size): - indices = tf.expand_dims(indices, 1) - tensor = tf.scatter_nd(indices, values, [tensor_size]) - return tensor - - -class RandomKCompressor(Compressor): - global_step = 0 - - def __init__(self, compress_ratio): - super().__init__() - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - """Use Python Random libraries RNG to compress by generating a list of indices to be transmitted.""" - - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - elemnum = tensor_flatten.get_shape().as_list()[0] - # all_indices = tf.range(elemnum, dtype=tf.int32) - k = max(1, int(elemnum * self.compress_ratio)) - seed = (sum([ord(s) for s in tensor.name]), tf.train.get_global_step()) - tf.compat.v1.set_random_seed(1) - indices = tf.random.stateless_uniform( - [k], seed, minval=0, maxval=elemnum + 1, dtype=tf.int32, name=None - ) - values = tf.gather(tensor_flatten, indices) - ctx = indices, tensor_shape - tensor_compressed = values - - return [tensor_compressed], ctx - - def decompress(self, tensor_compressed, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - indices, tensor_shape = ctx - values, = tensor_compressed - tensor_size = tf.math.reduce_prod(tensor_shape) - tensor_decompressed = desparsify(indices, values, tensor_size) - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/signsgd.py b/untitled folder/grace_dl/tensorflow/compressor/signsgd.py deleted file mode 100644 index 7a1a0c7..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/signsgd.py +++ /dev/null @@ -1,30 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class SignSGDCompressor(Compressor): - """ - (2018). signSGD: Compressed Optimisation for Non-Convex Problems. 
- Retrieved from https://arxiv.org/abs/1802.04434 - """ - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = tf.math.add_n(tensors) - agged_tensor = tf.cast(tf.math.greater_equal(agged_tensor, 0), dtype=tf.float32) - agged_tensor = agged_tensor * 2.0 - 1.0 - return agged_tensor - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - tensor_compressed = tf.math.greater_equal(tensor_flatten, 0) - return [tensor_compressed], tensor_shape - - def decompress(self, tensors, tensor_shape): - """Decoding the signs to float format """ - sign_encode, = tensors - sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 - tensor_decompressed = tf.reshape(sign_decode, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/signum.py b/untitled folder/grace_dl/tensorflow/compressor/signum.py deleted file mode 100644 index da0ab48..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/signum.py +++ /dev/null @@ -1,45 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class SignumCompressor(Compressor): - """ - (2018). signSGD: Compressed Optimisation for Non-Convex Problems. 
- Retrieved from https://arxiv.org/abs/1802.04434 - """ - - def __init__(self, momentum): - super().__init__() - self.momentum_factor = momentum - self.momentum = {} - for v in tf.trainable_variables(): - self.momentum[v.name] = tf.Variable(tf.zeros_like(v), trainable=False) - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = tf.math.add_n(tensors) - agged_tensor = tf.cast(tf.math.greater_equal(agged_tensor, 0), dtype=tf.float32) - agged_tensor = agged_tensor * 2.0 - 1.0 - return agged_tensor - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - - # update tensor by momentum - tensor = (1.0 - self.momentum_factor) * tensor + self.momentum_factor * self.momentum[name] - op = self.momentum[name].assign(tensor) - - with tf.control_dependencies([op]): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - tensor_compressed = tf.math.greater_equal(tensor_flatten, 0) - - return [tensor_compressed], tensor_shape - - def decompress(self, tensors, tensor_shape): - """Decoding the signs to float format """ - sign_encode, = tensors - sign_decode = tf.cast(sign_encode, dtype=tf.float32) * 2.0 - 1.0 - tensor_decompressed = tf.reshape(sign_decode, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/sketch.py b/untitled folder/grace_dl/tensorflow/compressor/sketch.py deleted file mode 100644 index c7022b4..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/sketch.py +++ /dev/null @@ -1,39 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class SketchCompressor(Compressor): - """ - SketchML: Accelerating Distributed Machine Learning with Data Sketches. - J. Jiang, F. Fu, T. Yang, and B. Cui. SIGMOD, 2018. 
- """ - - def __init__(self, quantiles): - super().__init__() - self.quantiles = quantiles - - def compress(self, tensor, name): - import tensorflow_probability as tfp - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - - x = tensor_flatten - edges = tfp.stats.quantiles(x, num_quantiles=self.quantiles, interpolation='linear') - bins = tf.cast(tfp.stats.find_bins(x, edges), dtype=tf.int32) - means = tf.math.unsorted_segment_mean(x, bins, num_segments=self.quantiles) - - tensor_compressed = tf.cast(bins, dtype=tf.uint8 if self.quantiles < 256 else tf.uint16) - means = tf.reshape(means, [-1]) - ctx = tensor_shape - tensor_compressed = tensor_compressed, means - - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - tensor_shape = ctx - tensor_compressed, means = tensor_compressed - bins = tf.cast(tensor_compressed, dtype=tf.int32) - tensor_decompressed = tf.gather(means, bins) - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/terngrad.py b/untitled folder/grace_dl/tensorflow/compressor/terngrad.py deleted file mode 100644 index b647030..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/terngrad.py +++ /dev/null @@ -1,42 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class TernGradCompressor(Compressor): - """ - (2017). TernGrad: Ternary Gradients to Reduce Communication in Distributed Deep Learning. 
- Retrieved from http://arxiv.org/abs/1705.07878 - """ - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - - std = tf.math.square(tensor_flatten - tf.math.reduce_mean(tensor_flatten)) - std = tf.math.sqrt(tf.math.reduce_mean(std)) - c = 2.5 - gradient = tf.clip_by_value(tensor_flatten, -c * std, c * std) - scaler = tf.math.reduce_max(tf.math.abs(gradient)) - - zeros = tf.zeros(tf.shape(tensor_flatten)) - abs_gradient = tf.abs(gradient) - sign_gradient = tf.sign(gradient) - rnd_sample = tf.random.uniform(tf.shape(tensor_flatten), 0, scaler) - where_cond = tf.less(rnd_sample, abs_gradient) - binarized_gradient = tf.where(where_cond, sign_gradient * scaler, zeros) - new_sign = tf.sign(binarized_gradient) # -1, 0, 1 - - scaler = tf.reshape(scaler, [-1]) - ctx = tensor_shape - tensor_compressed = tf.cast(new_sign, tf.int8), scaler - - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - tensor_shape = ctx - tensor_compressed, scaler = tensor_compressed - - sign = tf.cast(tensor_compressed, dtype=tf.float32) - tensor_decompressed = sign * scaler - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/threshold.py b/untitled folder/grace_dl/tensorflow/compressor/threshold.py deleted file mode 100644 index dd30042..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/threshold.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class ThresholdCompressor(Compressor): - - def __init__(self, threshold): - super().__init__(tensors_size_are_same=False) - self.threshold = threshold - - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - thr_mask = tf.math.greater(tf.math.abs(tensor_flatten), self.threshold) - values = tf.boolean_mask(tensor_flatten, thr_mask) - 
indices = tf.reshape(tf.where(thr_mask), [-1]) - ctx = tensor_shape - values = tf.bitcast(values, tf.int32) - indices = tf.cast(indices, dtype=tf.int32) - tensor_compressed = tf.concat([values, indices], 0) - return [tensor_compressed], ctx - - def decompress(self, tensors_compressed, ctx): - tensor_compressed, = tensors_compressed - values, indices = tf.split(tensor_compressed, 2) - values = tf.bitcast(values, tf.float32) - tensor_shape = ctx - tensor_size = tf.math.reduce_prod(tensor_shape) - indices = tf.expand_dims(indices, 1) - tensor_decompressed = tf.scatter_nd(indices, values, [tensor_size]) - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/topk.py b/untitled folder/grace_dl/tensorflow/compressor/topk.py deleted file mode 100644 index 773cd77..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/topk.py +++ /dev/null @@ -1,49 +0,0 @@ -import tensorflow as tf - -from grace.grace_dl.tensorflow import Compressor - - -def sparsify(tensor, compress_ratio): - elemnum = tensor.get_shape().as_list()[0] - k = max(1, int(elemnum * compress_ratio)) - _, indices = tf.math.top_k(tf.math.abs(tensor), k, sorted=False) - values = tf.gather(tensor, indices) - return values, indices - - -def desparsify(indices, values, tensor_size): - indices = tf.expand_dims(indices, 1) - tensor = tf.scatter_nd(indices, values, [tensor_size]) - return tensor - - -class TopKCompressor(Compressor): - def __init__(self, compress_ratio): - super().__init__() - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - elemnum = tensor_flatten.get_shape().as_list()[0] - - values, indices = sparsify(tensor_flatten, self.compress_ratio) - - indices = tf.cast(indices, tf.int32) - values = tf.bitcast(values, tf.int32) - tensor_compressed = tf.concat([values, indices], 0) - ctx = 
tensor_shape, elemnum - return [tensor_compressed], ctx - - def decompress(self, tensors_compressed, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - tensor_compressed, = tensors_compressed - values, indices = tf.split(tensor_compressed, 2) - values = tf.bitcast(values, tf.float32) - tensor_shape, tensor_size = ctx - #tensor_size = tf.math.reduce_prod(tensor_shape) - - tensor_decompressed = desparsify(indices, values, tensor_size) - - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/compressor/u8bit.py b/untitled folder/grace_dl/tensorflow/compressor/u8bit.py deleted file mode 100644 index 18b32b9..0000000 --- a/untitled folder/grace_dl/tensorflow/compressor/u8bit.py +++ /dev/null @@ -1,110 +0,0 @@ -import tensorflow as tf - -from grace_dl.tensorflow import Compressor - - -class U8bitCompressor(Compressor): - """ - (2015). 8-Bit Approximations for Parallelism in Deep Learning. 
- Retrieved from http://arxiv.org/abs/1511.04561 - """ - def compress(self, tensor, name): - dict128 = tf.constant( - [ - 1.5000001e-06, 2.7500000e-06, 7.2499997e-06, 1.8750001e-05, - 3.6250000e-05, 5.8749996e-05, 8.6249995e-05, 1.4375000e-04, - 2.3125000e-04, 3.1875001e-04, 4.0625001e-04, 5.1874999e-04, - 6.5624999e-04, 7.9374999e-04, 9.3124999e-04, 1.2187500e-03, - 1.6562500e-03, 2.0937501e-03, 2.5312500e-03, 2.9687500e-03, - 3.4062499e-03, 3.8437501e-03, 4.2812498e-03, 4.8437500e-03, - 5.5312500e-03, 6.2187500e-03, 6.9062500e-03, 7.5937500e-03, - 8.2812496e-03, 8.9687500e-03, 9.6562495e-03, 1.1093750e-02, - 1.3281250e-02, 1.5468750e-02, 1.7656250e-02, 1.9843750e-02, - 2.2031249e-02, 2.4218749e-02, 2.6406251e-02, 2.8593751e-02, - 3.0781250e-02, 3.2968748e-02, 3.5156250e-02, 3.7343752e-02, - 3.9531250e-02, 4.1718751e-02, 4.3906249e-02, 4.6718750e-02, - 5.0156251e-02, 5.3593751e-02, 5.7031251e-02, 6.0468748e-02, - 6.3906237e-02, 6.7343749e-02, 7.0781253e-02, 7.4218743e-02, - 7.7656247e-02, 8.1093743e-02, 8.4531240e-02, 8.7968737e-02, - 9.1406241e-02, 9.4843738e-02, 9.8281242e-02, 1.0546875e-01, - 1.1640625e-01, 1.2734374e-01, 1.3828126e-01, 1.4921875e-01, - 1.6015625e-01, 1.7109375e-01, 1.8203124e-01, 1.9296876e-01, - 2.0390625e-01, 2.1484375e-01, 2.2578125e-01, 2.3671874e-01, - 2.4765626e-01, 2.5859374e-01, 2.6953125e-01, 2.8046876e-01, - 2.9140624e-01, 3.0234376e-01, 3.1328124e-01, 3.2421875e-01, - 3.3515626e-01, 3.4609374e-01, 3.5703126e-01, 3.6796874e-01, - 3.7890625e-01, 3.8984376e-01, 4.0078124e-01, 4.1171876e-01, - 4.2265624e-01, 4.3359375e-01, 4.4453126e-01, 4.5859376e-01, - 4.7578123e-01, 4.9296874e-01, 5.1015621e-01, 5.2734375e-01, - 5.4453123e-01, 5.6171870e-01, 5.7890624e-01, 5.9609371e-01, - 6.1328125e-01, 6.3046873e-01, 6.4765620e-01, 6.6484374e-01, - 6.8203121e-01, 6.9921869e-01, 7.1640623e-01, 7.3359370e-01, - 7.5078118e-01, 7.6796871e-01, 7.8515619e-01, 8.0234367e-01, - 8.1953120e-01, 8.3671868e-01, 8.5390615e-01, 8.7109369e-01, - 8.8828117e-01, 
9.0546864e-01, 9.2265618e-01, 9.3984365e-01, - 9.5703113e-01, 9.7421867e-01, 9.9140614e-01, 9.9570298e-01, - ], dtype=tf.float32 - ) - - tensor_shape = tf.shape(tensor) - tensor_flatten = tf.reshape(tensor, [-1]) - - scaler = tf.math.reduce_max(tf.abs(tensor_flatten)) - new_tensor = tensor_flatten / scaler - sign = tf.sign(tensor_flatten) - new_tensor = tf.abs(new_tensor) - - import tensorflow_probability as tfp - edges = dict128 - bins = tf.cast(tfp.stats.find_bins(new_tensor, edges), dtype=tf.int8) - - scaler = tf.reshape(scaler, [-1]) - tensor_compressed = bins * tf.cast(sign, dtype=tf.int8), scaler - - return tensor_compressed, tensor_shape - - def decompress(self, tensor_compressed, tensor_shape): - tensor, scaler = tensor_compressed - dict128 = tf.constant( - [ - 1.5000001e-06, 2.7500000e-06, 7.2499997e-06, 1.8750001e-05, - 3.6250000e-05, 5.8749996e-05, 8.6249995e-05, 1.4375000e-04, - 2.3125000e-04, 3.1875001e-04, 4.0625001e-04, 5.1874999e-04, - 6.5624999e-04, 7.9374999e-04, 9.3124999e-04, 1.2187500e-03, - 1.6562500e-03, 2.0937501e-03, 2.5312500e-03, 2.9687500e-03, - 3.4062499e-03, 3.8437501e-03, 4.2812498e-03, 4.8437500e-03, - 5.5312500e-03, 6.2187500e-03, 6.9062500e-03, 7.5937500e-03, - 8.2812496e-03, 8.9687500e-03, 9.6562495e-03, 1.1093750e-02, - 1.3281250e-02, 1.5468750e-02, 1.7656250e-02, 1.9843750e-02, - 2.2031249e-02, 2.4218749e-02, 2.6406251e-02, 2.8593751e-02, - 3.0781250e-02, 3.2968748e-02, 3.5156250e-02, 3.7343752e-02, - 3.9531250e-02, 4.1718751e-02, 4.3906249e-02, 4.6718750e-02, - 5.0156251e-02, 5.3593751e-02, 5.7031251e-02, 6.0468748e-02, - 6.3906237e-02, 6.7343749e-02, 7.0781253e-02, 7.4218743e-02, - 7.7656247e-02, 8.1093743e-02, 8.4531240e-02, 8.7968737e-02, - 9.1406241e-02, 9.4843738e-02, 9.8281242e-02, 1.0546875e-01, - 1.1640625e-01, 1.2734374e-01, 1.3828126e-01, 1.4921875e-01, - 1.6015625e-01, 1.7109375e-01, 1.8203124e-01, 1.9296876e-01, - 2.0390625e-01, 2.1484375e-01, 2.2578125e-01, 2.3671874e-01, - 2.4765626e-01, 2.5859374e-01, 
2.6953125e-01, 2.8046876e-01, - 2.9140624e-01, 3.0234376e-01, 3.1328124e-01, 3.2421875e-01, - 3.3515626e-01, 3.4609374e-01, 3.5703126e-01, 3.6796874e-01, - 3.7890625e-01, 3.8984376e-01, 4.0078124e-01, 4.1171876e-01, - 4.2265624e-01, 4.3359375e-01, 4.4453126e-01, 4.5859376e-01, - 4.7578123e-01, 4.9296874e-01, 5.1015621e-01, 5.2734375e-01, - 5.4453123e-01, 5.6171870e-01, 5.7890624e-01, 5.9609371e-01, - 6.1328125e-01, 6.3046873e-01, 6.4765620e-01, 6.6484374e-01, - 6.8203121e-01, 6.9921869e-01, 7.1640623e-01, 7.3359370e-01, - 7.5078118e-01, 7.6796871e-01, 7.8515619e-01, 8.0234367e-01, - 8.1953120e-01, 8.3671868e-01, 8.5390615e-01, 8.7109369e-01, - 8.8828117e-01, 9.0546864e-01, 9.2265618e-01, 9.3984365e-01, - 9.5703113e-01, 9.7421867e-01, 9.9140614e-01, 9.9570298e-01, - ], dtype=tf.float32 - ) - # tensor is int8 - tensor = tf.cast(tensor, dtype=tf.int32) - sign = tf.cast(tf.sign(tensor), dtype=tf.float32) - index = tf.cast(tf.abs(tensor), dtype=tf.int32) - tensor_decompressed = tf.gather(dict128, index) * scaler * sign - tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/tensorflow/helper.py b/untitled folder/grace_dl/tensorflow/helper.py deleted file mode 100644 index 3a2f1ce..0000000 --- a/untitled folder/grace_dl/tensorflow/helper.py +++ /dev/null @@ -1,107 +0,0 @@ -def grace_from_params(params): - import horovod.tensorflow as hvd - world_size = hvd.size() - comp = params.get('compressor', 'none') - mem = params.get('memory', 'none') - comm = params.get('communicator', 'allreduce') - if comp == 'adaq': - from grace_dl.tensorflow.compressor.adaq import AdaqCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = AdaqCompressor(compress_ratio) - elif comp == 'dgc': - from grace_dl.tensorflow.compressor.dgc import DgcCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = DgcCompressor(compress_ratio) - elif comp == 'efsignsgd': - from 
grace_dl.tensorflow.compressor.efsignsgd import EFSignSGDCompressor - lr = params.get('lr', 0.1) - compressor = EFSignSGDCompressor(lr) - elif comp == 'fp16': - from grace_dl.tensorflow.compressor.fp16 import FP16Compressor - compressor = FP16Compressor() - elif comp == 'inceptionn': - from grace_dl.tensorflow.compressor.inceptionn import INCEPTIONNCompressor - error_bound = params.get('error_bound', 2e-10) - compressor = INCEPTIONNCompressor(error_bound) - elif comp == 'natural': - from grace_dl.tensorflow.compressor.natural import NaturalCompressor - compressor = NaturalCompressor() - elif comp == 'none': - from grace_dl.tensorflow.compressor.none import NoneCompressor - compressor = NoneCompressor() - elif comp == 'onebit': - from grace_dl.tensorflow.compressor.onebit import OneBitCompressor - compressor = OneBitCompressor() - elif comp == 'powersgd': - from grace_dl.tensorflow.compressor.powersgd import PowerSGDCompressor - momentum_factor = params.get('momentum_factor', 0.9) - compressor = PowerSGDCompressor(momentum_factor, world_size) - elif comp == 'qsgd': - from grace_dl.tensorflow.compressor.qsgd import QSGDCompressor - quantum_num = params.get('quantum_num', 127) - compressor = QSGDCompressor(quantum_num) - elif comp == 'randomk': - from grace_dl.tensorflow.compressor.randomk import RandomKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = RandomKCompressor(compress_ratio) - elif comp == 'signsgd': - from grace_dl.tensorflow.compressor.signsgd import SignSGDCompressor - compressor = SignSGDCompressor() - elif comp == 'signum': - from grace_dl.tensorflow.compressor.signum import SignumCompressor - momentum = params.get('momentum', 0.9) - compressor = SignumCompressor(momentum) - elif comp == 'sketch': - from grace_dl.tensorflow.compressor.sketch import SketchCompressor - quantiles = params.get('quantiles', 64) - compressor = SketchCompressor(quantiles) - elif comp == 'terngrad': - from grace_dl.tensorflow.compressor.terngrad 
import TernGradCompressor - compressor = TernGradCompressor() - elif comp == 'threshold': - from grace_dl.tensorflow.compressor.threshold import ThresholdCompressor - threshold = params.get('threshold', 0.01) - compressor = ThresholdCompressor(threshold) - elif comp == 'topk': - from grace_dl.tensorflow.compressor.topk import TopKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = TopKCompressor(compress_ratio) - elif comp == 'u8bit': - from grace_dl.tensorflow.compressor.u8bit import U8bitCompressor - compressor = U8bitCompressor() - else: - raise NotImplementedError(comp) - - if mem == 'dgc': - from grace_dl.tensorflow.memory.dgc import DgcMemory - momentum = params.get('momentum', 0.9) - gradient_clipping = params.get('gradient_clipping', False) - memory = DgcMemory(momentum, gradient_clipping, world_size) - elif mem == 'none': - from grace_dl.tensorflow.memory.none import NoneMemory - memory = NoneMemory() - elif mem == 'powersgd': - from grace_dl.tensorflow.memory.powersgd import PowerSGDMemory - compress_rank = params.get('compress_rank', 1) - memory = PowerSGDMemory(compressor.q_memory, compress_rank) - elif mem == 'residual': - from grace_dl.tensorflow.memory.residual import ResidualMemory - memory = ResidualMemory() - elif mem == 'efsignsgd': - from grace_dl.tensorflow.memory.efsignsgd import EFSignSGDMemory - lr = params.get('lr', 0.1) - memory = EFSignSGDMemory(lr) - else: - raise NotImplementedError(mem) - - if comm == 'allreduce': - from grace_dl.tensorflow.communicator.allreduce import Allreduce - return Allreduce(compressor, memory, world_size) - elif comm == 'allgather': - from grace_dl.tensorflow.communicator.allgather import Allgather - return Allgather(compressor, memory, world_size) - elif comm == 'broadcast': - from grace_dl.tensorflow.communicator.broadcast import Broadcast - return Broadcast(compressor, memory, world_size) - else: - raise NotImplementedError(comm) diff --git a/untitled 
folder/grace_dl/tensorflow/memory/.DS_Store b/untitled folder/grace_dl/tensorflow/memory/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0fgc5S?v9>(Bx@v{HMs#5IVdqD2)KH-tOF2o8XP9b2u1?X}{BsEQ)_4u6O% z2Yv!vIl?bM+?d(jrgmZn^@3=-W9>d?cV<7{i@llz09a?_wE=bjz`#P-*ubj9sGrJ$ zt*Dl5M27JIBZwh@J~%L3@YW5Q0nLDBKr^5j&lQ}vRBud0nNa*WPsie zCKf_VVJ1<$IR58IJuXxg%N%Gl?=B zm<$J|W@aiwVPbZSGle@aOQNnd1Db(F1}Jym#EhLogjYU)=P+=`Y2t?5MW0V@5(eq8 z)%wa-R%fxcvGth8Lnny+fd~jZ zA6;I*3}TOuJA4#-sfe{@hfz1`y~gfja&P~zY3?1gr%iLxZav&tgDzPE*rqGFBB`To38 z!S9(;g)t(2u27XvQJJTtOks>UMW|n!X5ex$uqv0kL;3%c>fiq_*QWGpH3OP~3o*cI zUAx=C5ISzljB;%g%UdjDh_gQo@!mvNim?=C60rsCpZ^eWs?UEx KysoY_1AhR)|7g?z diff --git a/untitled folder/grace_dl/torch/__init__.py b/untitled folder/grace_dl/torch/__init__.py deleted file mode 100644 index 4030ee4..0000000 --- a/untitled folder/grace_dl/torch/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -from abc import ABC, abstractmethod - - -class Memory(ABC): - @abstractmethod - def compensate(self, tensor, name): - """Update the tensor with the residuals.""" - raise NotImplemented("compensate was not implemented.") - - def update(self, tensor, name, compressor, tensor_compressed, ctx): - """Update the residuals.""" - pass - - -class Compressor(ABC): - """Interface for compressing and decompressing a given tensor.""" - - def __init__(self, average=True, tensors_size_are_same=True): - self.average = average - self.tensors_size_are_same = tensors_size_are_same - - @abstractmethod - def compress(self, tensor, name): - """Compresses a tensor and returns it with the context needed to decompress it.""" - raise NotImplemented("compress was not implemented.") - - @abstractmethod - def decompress(self, tensors, ctx): - 
"""Decompress the tensor with the given context.""" - raise NotImplemented("decompress was not implemented.") - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - return sum(tensors) - - -class Communicator(ABC): - @abstractmethod - def async_send(self, tensors, name,process_set): - raise NotImplemented("async_send was not implemented.") - - @abstractmethod - def wait_receive(self, handles, ctx,process_set): - raise NotImplemented("wait_receive was not implemented.") - - def __init__(self, compressor, memory): - self.compressor = compressor - self.memory = memory - - def send_step(self, tensor, name,process_set): - tensor = self.memory.compensate(tensor, name) - tensors_compressed, ctx = self.compressor.compress(tensor, name) - self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - handles = self.async_send(tensors_compressed, name,process_set) - return handles, ctx - - def receive_step(self, handles, ctx,process_set): - return self.wait_receive(handles, ctx,process_set) diff --git a/untitled folder/grace_dl/torch/communicator/.DS_Store b/untitled folder/grace_dl/torch/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T00 else t.numel() - tensors_size.append(size_dim0) - - if self.compressor.tensors_size_are_same: - tensors_size_ag = [tensors_size] * self.world_size # list of tensor sizes per rank - tensor_sizes = zip(*tensors_size_ag) # transpose - else: - tensors_size = torch.tensor(tensors_size) # TODO: set device - gathered = allgather(tensor=tensors_size,process_set=process_set) # tensor of tensor sizes per rank - tensor_sizes = gathered.view([self.world_size, -1]).t().tolist() # transpose, to list - - handles = [] - for 
tensor_compressed in tensors_compressed: - handle = allgather_async(tensor=tensor_compressed,process_set=process_set) - handles.append(handle) - - return handles, tensor_sizes - - def wait_receive(self, result, ctx,process_set): - handles, tensor_sizes = result - # print("Result",result) - tensors_ag = [] - for handle, sizes in zip(handles, tensor_sizes): - gathered = synchronize(handle) - # print("gathered",gathered) - # print("gathered",len(gathered)) - # print("sizes",sizes) - tensors_ag.append(gathered.split(sizes)) - # import time - # time.sleep(0.001) - list_tensor_decompressed = [] - for tensor_compressed in zip(*tensors_ag): - tensor_decompressed = self.compressor.decompress(tensor_compressed, ctx) - list_tensor_decompressed.append(tensor_decompressed) - - tensors_aggregated = self.compressor.aggregate(list_tensor_decompressed) - return (tensors_aggregated / self.world_size) if self.compressor.average else tensors_aggregated diff --git a/untitled folder/grace_dl/torch/communicator/allreduce.py b/untitled folder/grace_dl/torch/communicator/allreduce.py deleted file mode 100644 index 98e5396..0000000 --- a/untitled folder/grace_dl/torch/communicator/allreduce.py +++ /dev/null @@ -1,15 +0,0 @@ -from grace.grace_dl.torch import Communicator -from horovod.torch import allreduce_async_, synchronize - - -class Allreduce(Communicator): - - def async_send(self, tensors_compressed, name): - handles = [] - for i, tensor_compressed in enumerate(tensors_compressed): - handles.append(allreduce_async_(tensor_compressed, self.compressor.average, name + str(i))) - return handles - - def wait_receive(self, handles, ctx): - output = [synchronize(h) for h in handles] - return self.compressor.decompress(output, ctx) diff --git a/untitled folder/grace_dl/torch/communicator/broadcast.py b/untitled folder/grace_dl/torch/communicator/broadcast.py deleted file mode 100644 index 7b906e6..0000000 --- a/untitled folder/grace_dl/torch/communicator/broadcast.py +++ /dev/null @@ -1,27 
+0,0 @@ -from grace_dl.torch import Communicator -from horovod.torch import broadcast_async, synchronize - - -class Broadcast(Communicator): - - def __init__(self, compressor, memory, world_size): - super().__init__(compressor, memory) - self.world_size = world_size - - def async_send(self, tensors_compressed, name): - handles = [] - for root_rank in range(self.world_size): - rank_handles = [] - for i, tensor_compressed in enumerate(tensors_compressed): - rank_handles.append(broadcast_async(tensor_compressed, root_rank, name + str(root_rank) + '_' + str(i))) - handles.append(rank_handles) - return handles - - def wait_receive(self, handles, ctx): - tensors_decompressed = [] - for ranki in handles: - tensors_compressed = [synchronize(h) for h in ranki] - tensor_decompressed = self.compressor.decompress(tensors_compressed, ctx) - tensors_decompressed.append(tensor_decompressed) - tensor_aggregated = self.compressor.aggregate(tensors_decompressed) - return (tensor_aggregated / self.world_size) if self.compressor.average else tensor_aggregated diff --git a/untitled folder/grace_dl/torch/compressor/.DS_Store b/untitled folder/grace_dl/torch/compressor/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0= thr - selected = mask.sum() - - for _ in range(10): - if selected > 1.3 * numel * self.compress_ratio: - thr = 1.3 * thr - elif selected < 0.7 * numel * self.compress_ratio: - thr = 0.7 * thr - else: - break - mask = tensor.abs() >= thr - selected = mask.sum() - - indices, = torch.where(mask) - values = tensor[indices] - - tensor_compressed = values, indices - ctx = shape, mask, numel - return tensor_compressed, ctx - - def decompress(self, tensor_compressed, ctx): - values, indices = 
tensor_compressed - shape, _, numel = ctx - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/efsignsgd.py b/untitled folder/grace_dl/torch/compressor/efsignsgd.py deleted file mode 100644 index f981d0f..0000000 --- a/untitled folder/grace_dl/torch/compressor/efsignsgd.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class EFSignSGDCompressor(Compressor): - - def __init__(self, lr): - super().__init__(average=False) - self.learning_rate = lr - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - shape = tensor.size() - tensor = tensor.flatten() - - sign_encode = tensor >= 0 - mean = tensor.abs().mean() - tensor_compressed = mean, sign_encode.type(torch.uint8) - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - """Decoding the signs to float format """ - mean, sign_encode = tensor_compressed - sign_decode = sign_encode.type(torch.float32) * 2 - 1 - sign_decode = mean * sign_decode - tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - return sum(tensors) / self.learning_rate diff --git a/untitled folder/grace_dl/torch/compressor/fp16.py b/untitled folder/grace_dl/torch/compressor/fp16.py deleted file mode 100644 index 5021ae6..0000000 --- a/untitled folder/grace_dl/torch/compressor/fp16.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class FP16Compressor(Compressor): - """Compress all floating point gradients to 16-bit.""" - - def compress(self, tensor): - """Downcasts the tensor to 16-bit.""" - dtype = tensor.dtype - if dtype.is_floating_point: - # Only allow compression from other floating point types - tensor = 
tensor.type(torch.float16) - return tensor, dtype - - def decompress(self, tensors, dtype): - """Upcasts the tensor to the initialization dtype.""" - tensor_decompressed = tensors - if dtype.is_floating_point: - tensor_decompressed = tensor_decompressed.type(dtype) - return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/natural.py b/untitled folder/grace_dl/torch/compressor/natural.py deleted file mode 100644 index a208071..0000000 --- a/untitled folder/grace_dl/torch/compressor/natural.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch -import cupy # conda install -c conda-forge cupy=7.0.0=py37h0c141eb_2 -from torch.utils.dlpack import to_dlpack -from torch.utils.dlpack import from_dlpack - -from grace_dl.torch import Compressor - - -class NaturalCompressor(Compressor): - def __init__(self): - super().__init__() - - def compress(self, tensor, name): - shape = tensor.size() - tensor_flatten = tensor.flatten() - cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten)) - tensor_cast = cupy_tensor.view(cupy.int32) - sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000) - exp = tensor_cast & cupy.int32(0b01111111100000000000000000000000) - mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111) - exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111, - size=cupy_tensor.shape, - dtype=cupy.int32) - exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) - exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000) - exps = cupy.right_shift(exp_shift, 23) - exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18) - tensor_compressed = exps.astype(cupy.uint8) - return [from_dlpack(tensor_compressed.toDlpack())], shape - - - def decompress(self, tensor_compressed, shape): - tensor_compressed, = tensor_compressed - cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_compressed)) - sign = cupy_tensor > 
127 - exps = cupy.bitwise_and(cupy_tensor, 0b01111111) - floats = cupy.left_shift((exps + 18).astype(cupy.int32), 23).view(cupy.float32) - tensor_decompressed = cupy.where(sign, -floats, floats) - tensor_decompressed = cupy.multiply((exps >= 1).astype(cupy.float32), tensor_decompressed) - return from_dlpack(tensor_decompressed.toDlpack()).view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/none.py b/untitled folder/grace_dl/torch/compressor/none.py deleted file mode 100644 index 60e5916..0000000 --- a/untitled folder/grace_dl/torch/compressor/none.py +++ /dev/null @@ -1,12 +0,0 @@ -from grace_dl.torch import Compressor - - -class NoneCompressor(Compressor): - """Default no-op compression.""" - - def compress(self, tensor, name): - return [tensor], None - - def decompress(self, tensors, ctx): - tensor, = tensors - return tensor diff --git a/untitled folder/grace_dl/torch/compressor/onebit.py b/untitled folder/grace_dl/torch/compressor/onebit.py deleted file mode 100644 index 9758734..0000000 --- a/untitled folder/grace_dl/torch/compressor/onebit.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class OneBitCompressor(Compressor): - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - numel = tensor.numel() - - mask0 = tensor < 0 - sum0 = torch.sum(tensor[mask0]) - num0 = torch.sum(mask0).float() - mean0 = sum0 / num0 if num0 > 0 else sum0 - - mask1 = ~mask0 - sum1 = torch.sum(tensor[mask1]) - num1 = numel - num0 - mean1 = sum1 / num1 if num1 > 0 else sum1 - - tensor_compressed = mask0.type(torch.uint8), mean0, mean1 - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - mask0, mean0, mean1 = tensor_compressed - mask0= mask0.bool() - tensor_decompressed = mask0 * mean0 + ~mask0 * mean1 - tensor_decompressed = tensor_decompressed.view(shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/powersgd.py 
b/untitled folder/grace_dl/torch/compressor/powersgd.py deleted file mode 100644 index 9dffca5..0000000 --- a/untitled folder/grace_dl/torch/compressor/powersgd.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch - -from grace_dl.torch import Compressor -from horovod.torch import allreduce_ - - -@torch.jit.script -def orthogonalize(matrix): - n, m = matrix.shape - for i in range(m): - # Normalize the i'th column - col = matrix[:, i: i + 1] - col /= torch.sqrt(torch.sum(col ** 2)) - # Project it on the rest and remove it - if i + 1 < m: - rest = matrix[:, i + 1:] - # rest -= torch.matmul(col.t(), rest) * col - rest -= torch.sum(col * rest, dim=0) * col - - -class PowerSGDCompressor(Compressor): - """ - PowerSGD: Practical Low-Rank Gradient Compression for Distributed Optimization. - T. Vogels, S. P. Karimireddy, and M. Jaggi. In NeurIPS, 2019. - """ - - def __init__(self): - super().__init__() - self.q_memory = {} - - def compress(self, tensor, name): - if tensor.dim() == 1: - return [tensor], None - - shape = tensor.size() - matrix = tensor.view([shape[0], -1]) - q = self.q_memory[name] - # q, _ = torch.qr(q) - orthogonalize(q) - - p = torch.mm(matrix, q) - p = allreduce_(p) - # p, _ = torch.qr(p) - orthogonalize(p) - q = torch.mm(matrix.t(), p) - q = allreduce_(q) - ctx = p, q, shape - self.q_memory[name] = q - return [], ctx - - def decompress(self, tensors, ctx): - if ctx is None: - tensor, = tensors - return tensor - p, q, tensor_shape = ctx - new_tensor = torch.mm(p, q.t()) - tensor_decompressed = new_tensor.view(tensor_shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/qsgd.py b/untitled folder/grace_dl/torch/compressor/qsgd.py deleted file mode 100644 index 0c08dce..0000000 --- a/untitled folder/grace_dl/torch/compressor/qsgd.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class QSGDCompressor(Compressor): - - def __init__(self, quantum_num): - super().__init__() - 
self.quantum_num = quantum_num - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - norm = tensor.norm() - norm = norm.flatten() - abs_gradient = tensor.abs() - - level_float = self.quantum_num / norm * abs_gradient - previous_level = level_float.floor() - prob = torch.empty_like(tensor).uniform_() - is_next_level = (prob < (level_float - previous_level)).type(torch.float32) - new_level = (previous_level + is_next_level) - - sign = tensor.sign() - tensor_compressed = (new_level * sign).type(torch.int16) - tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) - tensor_compressed = tensor_compressed, norm - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, shape): - tensor_compressed, norm = tensor_compressed - - decode_output = tensor_compressed.type(torch.float32) - tensor_decompressed = norm / self.quantum_num * decode_output - tensor_decompressed = tensor_decompressed.view(shape) - return tensor_decompressed diff --git a/untitled folder/grace_dl/torch/compressor/randomk.py b/untitled folder/grace_dl/torch/compressor/randomk.py deleted file mode 100644 index c345956..0000000 --- a/untitled folder/grace_dl/torch/compressor/randomk.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch - -from grace.grace_dl.torch import Compressor - - -def sparsify(tensor, compress_ratio): - tensor = tensor.flatten() - numel = tensor.numel() - k = max(1, int(numel * compress_ratio)) - indices = torch.randperm(numel, device=tensor.device)[:k] - values = tensor[indices] - return indices, values - - -class RandomKCompressor(Compressor): - """Python libraries Based Compress by performing sparsification (i.e., sending a ratio of the actual tensor size.""" - - def __init__(self, compress_ratio): - super().__init__() - self.global_step = 0 - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - """Use Python Random libraries RNG to compress by generating a list 
of indices to be transmitted.""" - - h = sum(bytes(name, encoding='utf8'), self.global_step) - self.global_step += 1 - torch.manual_seed(h) - indices, values = sparsify(tensor, self.compress_ratio) - - ctx = indices, tensor.numel(), tensor.size() - return [values], ctx - - def decompress(self, tensors, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - indices, numel, shape = ctx - values, = tensors - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/signsgd.py b/untitled folder/grace_dl/torch/compressor/signsgd.py deleted file mode 100644 index 53637e1..0000000 --- a/untitled folder/grace_dl/torch/compressor/signsgd.py +++ /dev/null @@ -1,30 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class SignSGDCompressor(Compressor): - - def __init__(self): - super().__init__(average=False) - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - tensor_compressed = tensor >= 0 - return [tensor_compressed.type(torch.uint8)], shape - - def decompress(self, tensors, shape): - """Decoding the signs to float format """ - sign_encode, = tensors - sign_decode = sign_encode.type(torch.float32) * 2 - 1 - tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = sum(tensors) - sign = agged_tensor >= 0 - agged_tensor = sign * 2.0 - 1.0 - return agged_tensor diff --git a/untitled folder/grace_dl/torch/compressor/signum.py b/untitled folder/grace_dl/torch/compressor/signum.py deleted file mode 100644 index 5456633..0000000 --- a/untitled folder/grace_dl/torch/compressor/signum.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class 
SignumCompressor(Compressor): - - def __init__(self, momentum): - super().__init__(average=False) - self.momentum = momentum - self.momentums = {} - - def compress(self, tensor, name): - """Encoding and compressing the signs """ - shape = tensor.size() - tensor = tensor.flatten() - - # update tensor by momentum - if name in self.momentums: - tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] - self.momentums[name] = tensor - tensor_compressed = tensor >= 0 - return [tensor_compressed.type(torch.uint8)], shape - - def decompress(self, tensors, shape): - sign_encode, = tensors - """Decoding the signs to float format """ - sign_decode = sign_encode.type(torch.float32) * 2 - 1 - tensor_decompressed = sign_decode.view(shape) - return tensor_decompressed - - def aggregate(self, tensors): - """Aggregate a list of tensors.""" - agged_tensor = sum(tensors) - agged_tensor = agged_tensor >= 0 - agged_tensor = agged_tensor * 2.0 - 1.0 - return agged_tensor diff --git a/untitled folder/grace_dl/torch/compressor/terngrad.py b/untitled folder/grace_dl/torch/compressor/terngrad.py deleted file mode 100644 index cfe159c..0000000 --- a/untitled folder/grace_dl/torch/compressor/terngrad.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class TernGradCompressor(Compressor): - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - - std = (tensor - torch.mean(tensor)) ** 2 - std = torch.sqrt(torch.mean(std)) - c = 2.5 * std.item() - gradient = torch.clamp(tensor, -c, c) - abs_gradient = gradient.abs() - scalar = abs_gradient.max() - - sign_gradient = gradient.sign() * scalar - rnd_sample = torch.empty_like(tensor).uniform_(0, scalar.item()) - sign_gradient[rnd_sample >= abs_gradient] = 0 - new_sign = sign_gradient.sign() # -1, 0, 1 - - tensor_compressed = new_sign.type(torch.int8), scalar.flatten() - - return tensor_compressed, shape - - def decompress(self, tensor_compressed, 
shape): - tensor_compressed, scalar = tensor_compressed - sign = tensor_compressed.type(torch.float32) - tensor_decompressed = sign * scalar - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/threshold.py b/untitled folder/grace_dl/torch/compressor/threshold.py deleted file mode 100644 index c142d53..0000000 --- a/untitled folder/grace_dl/torch/compressor/threshold.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch - -from grace_dl.torch import Compressor - - -class ThresholdCompressor(Compressor): - - def __init__(self, threshold): - super().__init__(tensors_size_are_same=False) - self.threshold = threshold - - def compress(self, tensor, name): - shape = tensor.size() - tensor = tensor.flatten() - numel = tensor.numel() - - indices, = torch.where(tensor.abs() > self.threshold) - values = tensor[indices] - ctx = shape, numel - return [values, indices], ctx - - def decompress(self, tensor_compressed, ctx): - shape, numel = ctx - values, indices = tensor_compressed - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/compressor/topk.py b/untitled folder/grace_dl/torch/compressor/topk.py deleted file mode 100644 index 9ccfc2d..0000000 --- a/untitled folder/grace_dl/torch/compressor/topk.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch - -from grace.grace_dl.torch import Compressor - - -def sparsify(tensor, compress_ratio): - tensor = tensor.flatten() - k = max(1, int(tensor.numel() * compress_ratio)) - _, indices = torch.topk(tensor.abs(), k, sorted=False,) - values = torch.gather(tensor, 0, indices) - return values, indices - - -def desparsify(tensors, numel): - values, indices = tensors - # print("values, indices",values, indices,numel) - tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, 
device=values.device) - tensor_decompressed.scatter_(0, indices, values) - return tensor_decompressed - - -class TopKCompressor(Compressor): - - def __init__(self, compress_ratio): - super().__init__()#tensors_size_are_same=False - self.compress_ratio = compress_ratio - - def compress(self, tensor, name): - tensors = sparsify(tensor, self.compress_ratio) - ctx = tensor.numel(), tensor.size() - return tensors, ctx - - def decompress(self, tensors, ctx): - """Decompress by filling empty slots with zeros and reshape back using the original shape""" - numel, shape = ctx - tensor_decompressed = desparsify(tensors, numel) - return tensor_decompressed.view(shape) diff --git a/untitled folder/grace_dl/torch/helper.py b/untitled folder/grace_dl/torch/helper.py deleted file mode 100644 index 0796893..0000000 --- a/untitled folder/grace_dl/torch/helper.py +++ /dev/null @@ -1,96 +0,0 @@ -def grace_from_params(params,processset): - import sys - sys.path.append("/Users/garvitbanga/Downloads/AritraDutta/FedCola-LOCAL/grace/") - sys.path.append("/content/grace/") - import horovod.torch as hvd - world_size = processset.size() - # print("world_size",world_size,processset.size()) - comp = params.get('compressor', 'none') - mem = params.get('memory', 'none') - comm = params.get('communicator', 'allreduce') - if comp == 'dgc': - from grace_dl.torch.compressor.dgc import DgcCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = DgcCompressor(compress_ratio) - elif comp == 'efsignsgd': - from grace_dl.torch.compressor.efsignsgd import EFSignSGDCompressor - lr = params.get('lr', 0.1) - compressor = EFSignSGDCompressor(lr) - elif comp == 'fp16': - from grace_dl.torch.compressor.fp16 import FP16Compressor - compressor = FP16Compressor() - elif comp == 'natural': - from grace_dl.torch.compressor.natural import NaturalCompressor - compressor = NaturalCompressor() - elif comp == 'none': - from grace_dl.torch.compressor.none import NoneCompressor - compressor = 
NoneCompressor() - elif comp == 'onebit': - from grace_dl.torch.compressor.onebit import OneBitCompressor - compressor = OneBitCompressor() - elif comp == 'powersgd': - from grace_dl.torch.compressor.powersgd import PowerSGDCompressor - compressor = PowerSGDCompressor() - elif comp == 'qsgd': - from grace_dl.torch.compressor.qsgd import QSGDCompressor - quantum_num = params.get('quantum_num', 127) - compressor = QSGDCompressor(quantum_num) - elif comp == 'randomk': - from grace_dl.torch.compressor.randomk import RandomKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = RandomKCompressor(compress_ratio) - elif comp == 'signsgd': - from grace_dl.torch.compressor.signsgd import SignSGDCompressor - compressor = SignSGDCompressor() - elif comp == 'signum': - from grace_dl.torch.compressor.signum import SignumCompressor - momentum = params.get('momentum', 0.9) - compressor = SignumCompressor(momentum) - elif comp == 'terngrad': - from grace_dl.torch.compressor.terngrad import TernGradCompressor - compressor = TernGradCompressor() - elif comp == 'threshold': - from grace_dl.torch.compressor.threshold import ThresholdCompressor - threshold = params.get('threshold', 0.01) - compressor = ThresholdCompressor(threshold) - elif comp == 'topk': - from grace_dl.torch.compressor.topk import TopKCompressor - compress_ratio = params.get('compress_ratio', 0.3) - compressor = TopKCompressor(compress_ratio) - else: - raise NotImplementedError(comp) - - if mem == 'dgc': - from grace_dl.torch.memory.dgc import DgcMemory - momentum = params.get('momentum', 0.9) - gradient_clipping = params.get('gradient_clipping', False) - memory = DgcMemory(momentum, gradient_clipping) - elif mem == 'none': - from grace_dl.torch.memory.none import NoneMemory - memory = NoneMemory() - elif mem == 'powersgd': - from grace_dl.torch.memory.powersgd import PowerSGDMemory - compress_rank = params.get('compress_rank', 1) - memory = PowerSGDMemory(compressor.q_memory, compress_rank) - 
elif mem == 'residual': - from grace_dl.torch.memory.residual import ResidualMemory - memory = ResidualMemory() - elif mem == 'efsignsgd': - from grace_dl.torch.memory.efsignsgd import EFSignSGDMemory - lr = params.get('lr', 0.1) - memory = EFSignSGDMemory(lr) - else: - raise NotImplementedError(mem) - - if comm == 'allreduce': - from grace_dl.torch.communicator.allreduce import Allreduce - return Allreduce(compressor, memory) - elif comm == 'allgather': - from grace_dl.torch.communicator.allgather import Allgather - # print("compressor",compressor) - return Allgather(compressor, memory, world_size) - elif comm == 'broadcast': - from grace_dl.torch.communicator.broadcast import Broadcast - return Broadcast(compressor, memory, world_size) - else: - raise NotImplementedError(comm) diff --git a/untitled folder/grace_dl/torch/memory/.DS_Store b/untitled folder/grace_dl/torch/memory/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0S5Z-O8O(;SS3VK`cTCpuw5icRu7cim+m717hgE3o@v^|tU?)pN$h|lB9 z?pCzbiwBW11GC>|em2=}%T9+e#@$iaVyw;>vp^9`HVoeg`cappWGrPsuGSbc$--cg z4TE?onhgJu0X(}Z8^F1xpr6i92H<%OhH;eT&E^{~l`CuORjX>%tXpp&CtmL7qs;Zw zYkazpG7jc*Ke&wglU{xER3^C}CH+(-L}4F7ZmyyvloMBuk}y-bo_1KaW%uek)9GI4 zxFOnytyx1%Tg{_(LmV7;W;5H`+TA}n?>)v(iG0>va`@RP*)TYV7nm$;E6ncW06r5# z7F=U5O=6imfM>oSMv)K$!~ij{stlOB&Z@2I25Dc!05R}u2Jn6mpoosaLZiAmU{Eap zUV4+c`Gj5s>Zr;pIhr-p{VSS;( z8Fw_&NDL4IUm3vO52TLu|Lo`fUj@-X3=jjW$pEi(y{-#;GqrVTmso2B=p85u#uXYr lQedJ=F~nji-Ud|yc7YQ>$6%omEFknpK+!-0G4Q7hyaQ+(xSXAMfhz`8(GT2E#oBs+mo0)+a|hqE<8CF}q*6d|oI#?!EVLzwqd`6B+bk6ZvoKUf_%1 z3T7Q|L@xzfWB?gJ2ELyGbMCOY?@xspkO5@ik1-(62M#LHG8k)AM+Y>j1OUu{Sqb!I zEg@sLLCavQ5k^3$4h7Vq)RY)hhl8CPKg(dOQHK+1iVtcsQ&XW(l^y2i%AHV4Beuu@ 
zGBC+NT0iFH{eSRz{XZFmPsji=@UIx4nVMaz!j{b4I<+}@*K*JcP$|+c*7z0z9d#9h fue^$vL6v}=%MPGrFxChb5d0BPG+=`a{3rvj5fp6x diff --git a/untitled folder/patch_files/horovod/torch/.DS_Store b/untitled folder/patch_files/horovod/torch/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 input, output -# We keep input in order to make sure it does not get garbage collected -# before the operation is finished. -_handle_map = {} - - -def _check_function(function_factory, tensor): - function = function_factory(tensor) - if not hasattr(mpi_lib, function): - raise ValueError('Tensor type %s is not supported.' % tensor.type()) - if not tensor.is_contiguous(): - raise ValueError('Tensor is required to be contiguous.') - return function - - -def _allreduce_function_factory(tensor): - return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') - - -def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): - # Set the divisor for reduced gradients to average when necessary - if op == Average: - if rocm_built(): - # For ROCm, perform averaging at framework level - divisor = process_set.size() - op = Sum - else: - divisor = 1 - - elif op == Adasum: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") - if tensor.device.type != 'cpu' and gpu_available('torch'): - if nccl_built(): - if not is_homogeneous(): - raise NotImplementedError('Running GPU Adasum on heterogeneous cluster is not supported yet.') - elif not num_rank_is_power_2(int(size() / local_size())): - raise NotImplementedError('Running GPU Adasum with non-power of 2 nodes is not supported yet.') - if rocm_built(): - # For ROCm, 
perform averaging at framework level - divisor = local_size() - else: - divisor = 1 - else: - warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors are ' - 'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod ' - 'with HOROVOD_GPU_OPERATIONS=NCCL.') - divisor = 1 - else: - if not num_rank_is_power_2(size()): - raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') - divisor = 1 - else: - divisor = 1 - - function = _check_function(_allreduce_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)(tensor, output, divisor, - name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor, process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, output) - return handle - - -def allreduce_async(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs asynchronous averaging or summation of the input tensor - over all the Horovod processes. The input tensor is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. 
- postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the allreduce operation that can be used with `poll()` or - `synchronize()`. - """ - op = handle_average_backwards_compatibility(op, average) - output = tensor.new(tensor.shape) - return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set) - - -class HorovodAllreduce(torch.autograd.Function): - """An autograd function that performs allreduce on a tensor.""" - - @staticmethod - def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor, process_set): - ctx.average = average - ctx.op = op - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - ctx.process_set = process_set - handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor, process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, grad_output): - return allreduce(grad_output, average=ctx.average, op=ctx.op, - prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor, - process_set=ctx.process_set), None, None, None, None, None, None - - -def allreduce(tensor, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): - """ - A function that performs averaging or summation of the input tensor over all the - Horovod processes. The input tensor is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - This acts as a thin wrapper around an autograd function. 
If your input - tensor requires gradients, then callings this function will allow gradients - to be computed and backpropagated. - - Arguments: - tensor: A tensor to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A name of the reduction operation. - compression: Compression algorithm used during allreduce to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A tensor of the same shape and type as `tensor`, averaged or summed across all - processes. - """ - tensor_compressed, ctx = compression.compress(tensor) - summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name, op, - prescale_factor, postscale_factor, - process_set) - return compression.decompress(summed_tensor_compressed, ctx) - - -def allreduce_async_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs asynchronous in-place averaging or summation of the input - tensor over all the Horovod processes. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to reduce. - average: - .. warning:: .. 
deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the allreduce operation that can be used with `poll()` or - `synchronize()`. - """ - op = handle_average_backwards_compatibility(op, average) - return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor, process_set) - - -def allreduce_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs in-place averaging or summation of the input tensor over - all the Horovod processes. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. 
- process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A tensor of the same shape and type as `tensor`, averaged or summed across all - processes. - """ - handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor, process_set) - return synchronize(handle) - - -def _grouped_allreduce_function_factory(tensor): - return 'horovod_torch_grouped_allreduce_async_' + tensor.type().replace('.', '_') - - -def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): - # Set the divisor for reduced gradients to average when necessary - if op == Average: - if rocm_built(): - # For ROCm, perform averaging at framework level - divisor = process_set.size() - op = Sum - else: - divisor = 1 - elif op == Adasum: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") - if tensors[0].device.type != 'cpu' and gpu_available('torch'): - if nccl_built(): - if not is_homogeneous(): - raise NotImplementedError('Running GPU Adasum on heterogeneous cluster is not supported yet.') - elif not num_rank_is_power_2(int(size() / local_size())): - raise NotImplementedError('Running GPU Adasum with non-power of 2 nodes is not supported yet.') - if rocm_built(): - # For ROCm, perform averaging at framework level - divisor = local_size() - else: - divisor = 1 - else: - warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors are ' - 'copied to CPU memory instead. 
To use Adasum for GPU reduction, please compile Horovod ' - 'with HOROVOD_GPU_OPERATIONS=NCCL.') - divisor = 1 - else: - if not num_rank_is_power_2(size()): - raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.') - divisor = 1 - else: - divisor = 1 - - function = _check_function(_grouped_allreduce_function_factory, tensors[0]) - try: - handle = getattr(mpi_lib, function)(tensors, outputs, divisor, - name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor, process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tuple(tensors), tuple(outputs)) - return handle - - -def grouped_allreduce_async(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs asynchronous averaging or summation of the input tensor - list over all the Horovod processes. The input tensors are not modified. - - The reduction operations are keyed by the base name. If a base name is not - provided, an incremented auto-generated base name is used. Reductions are - performed across tensors in the same list position. The tensor type and - shape must be the same on all Horovod processes for tensors sharing - positions in the input tensor list. The reduction will not start until all - processes are ready to send and receive the tensors. - - Arguments: - tensors: A list of tensors to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. 
- process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the group allreduce operation that can be used with `poll()` or - `synchronize()`. - """ - op = handle_average_backwards_compatibility(op, average) - outputs = [t.new(t.shape) for t in tensors] - return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set) - - -class HorovodGroupedAllreduce(torch.autograd.Function): - """An autograd function that performs allreduce on a list of tensors.""" - - @staticmethod - def forward(ctx, average, name, op, prescale_factor, postscale_factor, process_set: ProcessSet, *tensors): - ctx.average = average - ctx.op = op - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - ctx.process_set = process_set - handle = grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor, - process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, *grad_output): - grad_reduced = grouped_allreduce(list(grad_output), average=ctx.average, op=ctx.op, - prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor, - process_set=ctx.process_set) - return (None, None, None, None, None, None, *grad_reduced) - - -def grouped_allreduce(tensors, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): - """ - A function that performs averaging or summation of the input tensor - list over all the Horovod processes. The input tensors are not modified. - - The reduction operations are keyed by the base name. If a base name is not - provided, an incremented auto-generated base name is used. Reductions are - performed across tensors in the same list position. The tensor type and - shape must be the same on all Horovod processes for tensors sharing - positions in the input tensor list. 
The reduction will not start until all - processes are ready to send and receive the tensors. - - This acts as a thin wrapper around an autograd function. If your input - tensors require gradients, then calling this function will allow gradients - to be computed and backpropagated. - - Arguments: - tensors: A list of tensors to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A base name to use for the group reduction operation. - compression: Compression algorithm used during allreduce to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A list containing tensors of the same shape and type as in `tensors`, - averaged or summed across all processes. - """ - tensors_compressed, ctxs = zip(*[compression.compress(t) for t in tensors]) - summed_tensors_compressed = HorovodGroupedAllreduce.apply(average, name, op, - prescale_factor, postscale_factor, - process_set, *tensors_compressed) - return [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] - - -def grouped_allreduce_async_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs asynchronous in-place averaging or summation of the input - tensors over all the Horovod processes. - - The reduction operations are keyed by the base name. 
If a base name is not - provided, an incremented auto-generated base name is used. Reductions are - performed across tensors in the same list position. The tensor type and - shape must be the same on all Horovod processes for tensors sharing - positions in the input tensor list. The reduction will not start until all - processes are ready to send and receive the tensors. - - Arguments: - tensors: A list of tensors to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the group allreduce operation that can be used with `poll()` or - `synchronize()`. - """ - op = handle_average_backwards_compatibility(op, average) - return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor, process_set) - - -def grouped_allreduce_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0, - process_set=global_process_set): - """ - A function that performs in-place averaging or summation of the input tensors over - all the Horovod processes. - - The reduction operations are keyed by the base name. If a base name is not - provided, an incremented auto-generated base name is used. Reductions are - performed across tensors in the same list position. The tensor type and - shape must be the same on all Horovod processes for tensors sharing - positions in the input tensor list. 
The reduction will not start until all - processes are ready to send and receive the tensors. - - Arguments: - tensors: A list of tensors to reduce. - average: - .. warning:: .. deprecated:: 0.19.0 - - Use `op` instead. Will be removed in v1.0. - - name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. - Supported op values are Sum, Average, Min, Max, and Product. Defaults - to Average if None is given. - prescale_factor: Multiplicative factor to scale tensor before allreduce. - postscale_factor: Multiplicative factor to scale tensor after allreduce. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A list containing tensors of the same shape and type as in `tensors`, - averaged or summed across all processes. - """ - handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor, process_set) - return synchronize(handle) - - -def sparse_allreduce_async(tensor, name, op, process_set=global_process_set): - # Allgather aggregates along the first dimension, so we need to transpose the - # indices to enforce correct concatenation behavior, then transpose back prior to - # constructing the new aggregated sparse gradient - t = tensor - indices_handle = allgather_async(t._indices().transpose(0, 1).contiguous(), name=f'{name}.indices', - process_set=process_set) - values_handle = allgather_async(t._values(), name=f'{name}.values', process_set=process_set) - - def handle(): - # We need to sync values handle firstly for torch nightly >= 10.0 - # Issue: https://github.com/horovod/horovod/issues/2961 - values = synchronize(values_handle) - indices = synchronize(indices_handle) - - values = (values / process_set.size()) if op == Average else values - - if indices.dim() == 0 or values.dim() == 0: - return t.new().resize_as_(t) - return t.new(indices.transpose(0, 1), values, 
t.size()) - - return handle - - -def _allgather_function_factory(tensor): - return 'horovod_torch_allgather_async_' + tensor.type().replace('.', '_') - - -def _allgather_async(tensor, output, name, process_set: ProcessSet): - function = _check_function(_allgather_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)( - tensor, output, name.encode() if name is not None else _NULL, - process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, output) - return handle - - -def allgather_async(tensor, name=None, process_set=global_process_set): - """ - A function that asynchronously concatenates the input tensor with the same input - tensor on all other Horovod processes. The input tensor is not modified. - - The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Arguments: - tensor: A tensor to allgather. - name: A name of the allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the allgather operation that can be used with `poll()` or - `synchronize()`. 
- """ - output = tensor.new() - return _allgather_async(tensor, output, name, process_set) - - -class HorovodAllgather(torch.autograd.Function): - """An autograd function that performs allgather on a tensor.""" - - @staticmethod - def forward(ctx, tensor, name, process_set: ProcessSet): - ctx.dim = tensor.shape[0] - ctx.process_set = process_set - handle = allgather_async(tensor, name, process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#average CHANGE - - dim_t = torch.IntTensor([ctx.dim]) - dim = allgather(dim_t, process_set=ctx.process_set).view(ctx.process_set.size()) - - r = ctx.process_set.rank() - offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - return grad_reduced.narrow(0, offset, ctx.dim), None, None - - -def allgather(tensor, name=None, process_set=global_process_set): - """ - A function that concatenates the input tensor with the same input tensor on - all other Horovod processes. The input tensor is not modified. - - The concatenation is done on the first dimension, so the corresponding - input tensors on the different processes must have the same rank and - shape, except for the first dimension, which is allowed to be different. - - This acts as a thin wrapper around an autograd function. If your input - tensor requires gradients, then callings this function will allow gradients - to be computed and backpropagated. - - Arguments: - tensor: A tensor to allgather. - name: A name of the allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A tensor of the same type as `tensor`, concatenated on dimension zero - across all processes. 
The shape is identical to the input shape, except for - the first dimension, which may be greater and is the sum of all first - dimensions of the tensors in different Horovod processes. - """ - return HorovodAllgather.apply(tensor, name, process_set) - - -def _grouped_allgather_function_factory(tensor): - return 'horovod_torch_grouped_allgather_async_' + tensor.type().replace('.', '_') - - -def _grouped_allgather_async(tensors, outputs, name, process_set: ProcessSet): - function = _check_function(_grouped_allgather_function_factory, tensors[0]) - try: - handle = getattr(mpi_lib, function)( - tensors, outputs, name.encode() if name is not None else _NULL, - process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tuple(tensors), tuple(outputs)) - return handle - - -def grouped_allgather_async(tensors, name=None, process_set=global_process_set): - """ - A function that asynchronously concatenates each input tensor with the corresponding - input tensor on all other Horovod processes for a list of input tensors. The input - tensors are not modified. - - The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. - - Arguments: - tensors: A list of tensors to allgather. - name: A base name to use for the group allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the group allgather operation that can be used with `poll()` or - `synchronize()`. 
- """ - outputs = [t.new() for t in tensors] - return _grouped_allgather_async(tensors, outputs, name, process_set) - - -class HorovodGroupedAllgather(torch.autograd.Function): - """An autograd function that performs allgather on a list of tensors.""" - - @staticmethod - def forward(ctx, name, process_set: ProcessSet, *tensors): - ctx.dims = [t.shape[0] for t in tensors] - ctx.process_set = process_set - handle = grouped_allgather_async(list(tensors), name, process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, *grad_output): - grad_reduced = grouped_allreduce(list(grad_output), average=True, process_set=ctx.process_set) - - dim_ts = [torch.IntTensor([dim]) for dim in ctx.dims] - dims = [dt.view(ctx.process_set.size()) - for dt in grouped_allgather(dim_ts, process_set=ctx.process_set)] - - r = ctx.process_set.rank() - offsets = [torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - for dim in dims] - result = [gr.narrow(0, offset, dim) - for gr, offset, dim in zip(grad_reduced, offsets, ctx.dims)] - return (None, None, *result) - - -def grouped_allgather(tensors, name=None, process_set=global_process_set): - """ - A function that concatenates each input tensor with the corresponding - input tensor on all other Horovod processes for a list of input tensors. - The input tensors are not modified. - - The concatenation is done on the first dimension, so the corresponding input - tensors on the different processes must have the same rank and shape, except - for the first dimension, which is allowed to be different. - - Arguments: - tensors: A list of tensors to allgather. - name: A base name to use for the group allgather operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A list containing tensors of the same type as in `tensors`. Each tensor - is concatenated on dimension zero across all processes. 
Its shape is - identical to the corresponding input shape, expect for the first - dimension, which may be greater and is the sum of all first dimensions - of the corresponding tensor in different Horovod processes. - """ - return HorovodGroupedAllgather.apply(name, process_set, *tensors) - - -def _broadcast_function_factory(tensor): - return 'horovod_torch_broadcast_async_' + tensor.type().replace('.', '_') - - -def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet): - function = _check_function(_broadcast_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)( - tensor, output, root_rank, name.encode() if name is not None else _NULL, - process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, output) - return handle - - -def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set): - """ - A function that asynchronously broadcasts the input tensor on root rank to the same - input tensor on all other Horovod processes. The input tensor is not modified. - - The broadcast operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The broadcast will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to broadcast. - root_rank: The rank to broadcast the value from. - name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the broadcast operation that can be used with `poll()` or - `synchronize()`. 
- """ - output = tensor.new(tensor.shape) - return _broadcast_async(tensor, output, root_rank, name, process_set) - - -class HorovodBroadcast(torch.autograd.Function): - """An autograd function that broadcasts a tensor.""" - - @staticmethod - def forward(ctx, tensor, root_rank, name, process_set: ProcessSet): - ctx.root_rank = root_rank - ctx.process_set = process_set - handle = broadcast_async(tensor, root_rank, name, process_set) - return synchronize(handle) - - @staticmethod - def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#True CHANGED - if rank() != ctx.root_rank: - grad_reduced *= 0 - return grad_reduced, None, None, None - - -def broadcast(tensor, root_rank, name=None, process_set=global_process_set): - """ - A function that broadcasts the input tensor on root rank to the same input tensor - on all other Horovod processes. The input tensor is not modified. - - The broadcast operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The broadcast will not start until all processes - are ready to send and receive the tensor. - - This acts as a thin wrapper around an autograd function. If your input - tensor requires gradients, then callings this function will allow gradients - to be computed and backpropagated. - - Arguments: - tensor: A tensor to broadcast. - root_rank: The rank to broadcast the value from. - name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A tensor of the same shape and type as `tensor`, with the value broadcasted - from root rank. 
- """ - return HorovodBroadcast.apply(tensor, root_rank, name, process_set) - - -def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_set): - """ - A function that asynchronously broadcasts the input tensor on root rank to the same - input tensor on all other Horovod processes. The operation is performed in-place. - - The broadcast operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The broadcast will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to broadcast. - root_rank: The rank to broadcast the value from. - name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the broadcast operation that can be used with `poll()` or - `synchronize()`. - """ - return _broadcast_async(tensor, tensor, root_rank, name, process_set) - - -def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): - """ - A function that broadcasts the input tensor on root rank to the same input tensor - on all other Horovod processes. The operation is performed in-place. - - The broadcast operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The broadcast will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to broadcast. - root_rank: The rank to broadcast the value from. - name: A name of the broadcast operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. 
- - Returns: - A tensor of the same shape and type as `tensor`, with the value broadcasted - from root rank. - """ - handle = broadcast_async_(tensor, root_rank, name, process_set) - return synchronize(handle) - -def _alltoall_function_factory(tensor): - return 'horovod_torch_alltoall_async_' + tensor.type().replace('.', '_') - -def _alltoall_async(tensor, splits, output, output_received_splits, name, process_set: ProcessSet): - if splits is None: - # If splits not provided, create empty tensor as placeholder - splits = torch.tensor([], dtype=torch.int32, device='cpu') - elif not isinstance(splits, torch.Tensor): - splits = torch.tensor(splits, dtype=torch.int32, device='cpu') - function = _check_function(_alltoall_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)( - tensor, splits, output, output_received_splits, name.encode() if name is not None else _NULL, - process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, splits, (output, output_received_splits)) - return handle - - -def alltoall_async(tensor, splits=None, name=None, process_set=global_process_set): - """ - A function that scatters slices of the input tensor to all other Horovod processes - and returns a tensor of gathered slices from all other Horovod processes. The input - tensor is not modified. - - The slicing is done on the first dimension, so the input tensors on - the different processes must have the same rank and shape, except for the - first dimension, which is allowed to be different. - - Arguments: - tensor: A tensor to distribute with alltoall. - splits: A tensor of integers in rank order describing how many - elements in `tensor` to send to each worker. Splitting is - applied along the first dimension of `tensor`. If `splits` is - not provided, the first dimension is split equally by the - number of Horovod processes. - name: A name of the alltoall operation. 
- process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - A handle to the alltoall operation that can be used with `poll()` or - `synchronize()`. - """ - output = tensor.new() - if isinstance(splits, torch.Tensor): - output_received_splits = splits.new() - else: - output_received_splits = torch.empty(size(), dtype=torch.int32, device='cpu') - return _alltoall_async(tensor, splits, output, output_received_splits, name, process_set) - - -class HorovodAlltoall(torch.autograd.Function): - """An autograd function that performs alltoall on a tensor.""" - - @staticmethod - def forward(ctx, tensor, splits, name, process_set: ProcessSet): - handle = alltoall_async(tensor, splits, name, process_set) - output, received_splits = synchronize(handle) - - ctx.process_set = process_set - ctx.recvsplits = received_splits - if splits is None: - return output - else: - ctx.mark_non_differentiable(received_splits) - return output, received_splits - - @staticmethod - def backward(ctx, grad_output, *dead_gradients): - grad_wrt_tensor, _ = alltoall(grad_output, splits=ctx.recvsplits, - process_set=ctx.process_set) - return grad_wrt_tensor, None, None, None - - -def alltoall(tensor, splits=None, name=None, process_set=global_process_set): - """ - A function that scatters slices of the input tensor to all other Horovod processes - and returns a tensor of gathered slices from all other Horovod processes. The input - tensor is not modified. - - The slicing is done on the first dimension, so the input tensors on - the different processes must have the same rank and shape, except for the - first dimension, which is allowed to be different. - - This acts as a thin wrapper around an autograd function. If your input - tensor requires gradients, then callings this function will allow gradients - to be computed and backpropagated. - - Arguments: - tensor: A tensor to distribute with alltoall. 
- splits: A tensor of integers in rank order describing how many - elements in `tensor` to send to each worker. Splitting is - applied along the first dimension of `tensor`. If `splits` is - not provided, the first dimension is split equally by the - number of Horovod processes. - name: A name of the alltoall operation. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - - Returns: - 1) A tensor containing the gathered tensor data from all workers. - 2) If `splits` has been provided: A tensor of integers in rank order - describing how many elements in the output tensor have been received - from each worker. - """ - return HorovodAlltoall.apply(tensor, splits, name, process_set) - - -def _reducescatter_function_factory(tensor): - return 'horovod_torch_reducescatter_async_' + tensor.type().replace('.', '_') - - -def _reducescatter_async(tensor, output, name, op, process_set: ProcessSet, - prescale_factor, postscale_factor): - function = _check_function(_reducescatter_function_factory, tensor) - try: - handle = getattr(mpi_lib, function)(tensor, output, - name.encode() if name is not None else _NULL, - op, process_set.process_set_id, - prescale_factor, postscale_factor) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tensor, output) - return handle - - -def reducescatter_async(tensor, name=None, op=Average, process_set=global_process_set, - prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs asynchronous reduction of the input tensor over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensor is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. 
The reduction will not start until all processes - are ready to send and receive the tensor. - - The input tensors on the different processes must have the same rank and shape. The - output tensor will be the same rank on all processes, but the first dimension may - be different. - - Arguments: - tensor: A tensor to average and sum. - name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensor before reducescatter. - postscale_factor: Multiplicative factor to scale tensor after reducescatter. - - Returns: - A handle to the reducescatter operation that can be used with `poll()` or - `synchronize()`. - """ - output = tensor.new() - return _reducescatter_async(tensor, output, name, op, process_set, - prescale_factor, postscale_factor) - - -class HorovodReducescatter(torch.autograd.Function): - """An autograd function that performs reducescatter on a tensor.""" - - @staticmethod - def forward(ctx, tensor, name, op, process_set, prescale_factor, postscale_factor): - ctx.op = op - ctx.process_set = process_set - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - handle = reducescatter_async(tensor, name, op, process_set, prescale_factor, postscale_factor) - return synchronize(handle) - - @staticmethod - def backward(ctx, grad_output): - if ctx.op == Sum: - grad_output *= ctx.process_set.size() - if ctx.prescale_factor != 1.0: - grad_output *= ctx.prescale_factor - if ctx.postscale_factor != 1.0: - grad_output *= ctx.postscale_factor - - return allgather(grad_output, process_set=ctx.process_set), None, None, None, None, None - - -def reducescatter(tensor, name=None, compression=Compression.none, op=Average, - process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): 
- """ - A function that performs reduction of the input tensor over all the Horovod - processes, then scatters the results across all Horovod processes. The input tensor - is not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensor: A tensor to average/sum and scatter. - name: A name of the reduction operation. - compression: Compression algorithm used during reducescatter to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensor before reducescatter. - postscale_factor: Multiplicative factor to scale tensor after reducescatter. - - Returns: - A tensor of the same rank and type as `tensor` across all processes. The shape - is identical to the input shape, except for the first dimension, which will be - divided across the different Horovod processes. 
- """ - tensor_compressed, ctx = compression.compress(tensor) - reduced_tensor_compressed = HorovodReducescatter.apply(tensor_compressed, name, op, process_set, - prescale_factor, postscale_factor) - return compression.decompress(reduced_tensor_compressed, ctx) - - -def _grouped_reducescatter_function_factory(tensor): - return 'horovod_torch_grouped_reducescatter_async_' + tensor.type().replace('.', '_') - - -def _grouped_reducescatter_async(tensors, outputs, name, op, process_set: ProcessSet, - prescale_factor, postscale_factor): - function = _check_function(_grouped_reducescatter_function_factory, tensors[0]) - try: - handle = getattr(mpi_lib, function)(tensors, outputs, - name.encode() if name is not None else _NULL, - op, process_set.process_set_id, - prescale_factor, postscale_factor) - except RuntimeError as e: - raise HorovodInternalError(e) - _handle_map[handle] = (tuple(tensors), tuple(outputs)) - return handle - - -def grouped_reducescatter_async(tensors, name=None, op=Average, process_set=global_process_set, - prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs asynchronous reduction of a list of input tensors over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensors are not modified. - - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. For each of the input tensors, the type and shape must - be the same on all Horovod processes for a given name. The reduction will not start - until all processes are ready to send and receive the tensors. - - The input tensor at some place in the list tensor must have the same rank and shape - on the different processes. The corresponding output tensor will be the same rank on - all processes, but the first dimension may be different. - - Arguments: - tensors: A list of tensors to average and sum. - name: A base name to use for the group reduction operation. 
- op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensors before reducescatter. - postscale_factor: Multiplicative factor to scale tensors after reducescatter. - - Returns: - A handle to the group reducescatter operation that can be used with `poll()` or - `synchronize()`. - """ - outputs = [t.new() for t in tensors] - return _grouped_reducescatter_async(tensors, outputs, name, op, process_set, - prescale_factor, postscale_factor) - - -class HorovodGroupedReducescatter(torch.autograd.Function): - """An autograd function that performs reducescatter on a list of tensors.""" - - @staticmethod - def forward(ctx, name, op, process_set, prescale_factor, postscale_factor, *tensors): - ctx.op = op - ctx.process_set = process_set - ctx.prescale_factor = prescale_factor - ctx.postscale_factor = postscale_factor - handle = grouped_reducescatter_async(list(tensors), name, op, process_set, - prescale_factor, postscale_factor, ) - return synchronize(handle) - - @staticmethod - def backward(ctx, *grad_output): - if ctx.op == Sum: - grad_output = [g * ctx.process_set.size() for g in grad_output] - if ctx.prescale_factor != 1.0: - grad_output = [ctx.prescale_factor * g for g in grad_output] - if ctx.postscale_factor != 1.0: - grad_output = [ctx.postscale_factor * g for g in grad_output] - - return (None, None, None, None, None, - *grouped_allgather(grad_output, process_set=ctx.process_set)) - - -def grouped_reducescatter(tensors, name=None, compression=Compression.none, op=Average, - process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): - """ - A function that performs reduction of a list of input tensors over all the - Horovod processes, then scatters the results across all Horovod processes. The - input tensors are not modified. 
- - The reduction operation is keyed by the name. If name is not provided, an incremented - auto-generated name is used. The tensor type and shape must be the same on all - Horovod processes for a given name. The reduction will not start until all processes - are ready to send and receive the tensor. - - Arguments: - tensors: A list of tensors to average and sum. - name: A base name to use for the group reduction operation. - compression: Compression algorithm used during reducescatter to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - op: The reduction operation to combine tensors across different ranks. - Defaults to Average. - process_set: Process set object to limit this operation to a subset of - Horovod processes. Default is the global process set. - prescale_factor: Multiplicative factor to scale tensors before reducescatter. - postscale_factor: Multiplicative factor to scale tensors after reducescatter. - - - Returns: - A list containing tensors of the same rank and type as in `tensors`. For each - tensor the shape is identical to the input shape, except for the first dimension, - which will be divided across the different Horovod processes. - """ - tensors_compressed = [] - ctxs = [] - for tensor in tensors: - tensor_compressed, ctx = compression.compress(tensor) - tensors_compressed.append(tensor_compressed) - ctxs.append(ctx) - reduced_tensors_compressed = HorovodGroupedReducescatter.apply(name, op, process_set, - prescale_factor, postscale_factor, - *tensors_compressed) - return [compression.decompress(reduced_tensor_compressed, ctx) - for reduced_tensor_compressed, ctx in zip(reduced_tensors_compressed, ctxs)] - - -def poll(handle): - """ - Polls an allreduce, allgather, alltoall, broadcast, or reducescatter handle to determine whether - underlying asynchronous operation has completed. After `poll()` returns `True`, - `synchronize()` will return without blocking. 
- - Arguments: - handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter - asynchronous operation. - - Returns: - A flag indicating whether the operation has completed. - """ - return mpi_lib.horovod_torch_poll(handle) != 0 - - -def synchronize(handle): - """ - Synchronizes an asynchronous allreduce, allgather, alltoall, broadcast, or reducescatter operation - until it's completed. Returns the result of the operation. - - Arguments: - handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter - asynchronous operation. - - Returns: - A single output tensor of the operation or a tuple of multiple output tensors. - """ - if handle not in _handle_map: - return - - try: - mpi_lib.horovod_torch_wait_and_clear(handle) - output = _handle_map.pop(handle)[-1] - return output - except RuntimeError as e: - _handle_map.pop(handle, None) - raise HorovodInternalError(e) - - -def join(device=-1) -> int: - """A function that indicates that the rank finished processing data. - - All ranks that did not call join() continue to process allreduce operations. - This function blocks Python thread until all ranks join. - - Arguments: - device: An id of the device to create temprorary zero tensors (default -1, CPU) - - Returns: - Id of the rank that joined last. - """ - output = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu")) - try: - handle = mpi_lib.horovod_torch_join(output, device) - except RuntimeError as e: - raise HorovodInternalError(e) - - _handle_map[handle] = (None, output) - - return synchronize(handle).item() - -def barrier(process_set=global_process_set): - """ - A function that acts as a simple sychronization point for ranks specified - in the given process group(default to global group). Ranks that reach - this function call will stall until all other ranks have reached. - - Arguments: - process_set: Process set object to limit this operation to a subset of - Horovod processes. 
Default is the global process set. - """ - - try: - handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id) - except RuntimeError as e: - raise HorovodInternalError(e) - - _handle_map[handle] = (None, None) - - synchronize(handle) diff --git a/untitled folder/patch_files/horovod/torch/optimizer.py b/untitled folder/patch_files/horovod/torch/optimizer.py deleted file mode 100644 index 7e3f3b1..0000000 --- a/untitled folder/patch_files/horovod/torch/optimizer.py +++ /dev/null @@ -1,618 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# Modifications copyright Microsoft -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import os -import warnings - -from contextlib import contextmanager - -import torch - -from horovod.common.util import split_list - -from horovod.torch.compression import Compression -from horovod.torch.functions import broadcast_object -from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_, sparse_allreduce_async -from horovod.torch.mpi_ops import synchronize -from horovod.torch.mpi_ops import size -from horovod.torch.mpi_ops import Average, Adasum, Sum -from horovod.torch.mpi_ops import rocm_built -from horovod.torch.mpi_ops import ProcessSet, global_process_set - - -class _DistributedOptimizer(torch.optim.Optimizer): - def __init__(self, params, named_parameters,grace, compression, - backward_passes_per_step=1, op=Average, - gradient_predivide_factor=1.0, - groups=None, - sparse_as_dense=False, - process_set=global_process_set): - super(self.__class__, self).__init__(params) - self._grace = grace - self._process_set=process_set - self._compression = compression - - if named_parameters is not None: - named_parameters = list(named_parameters) - else: - named_parameters = [(f'allreduce.noname.{i}.{j}', v) - for i, param_group in enumerate(self.param_groups) - for j, v in enumerate(param_group['params'])] - # make sure that named_parameters are tuples - if any([not isinstance(p, tuple) for p in named_parameters]): - raise ValueError('named_parameters should be a sequence of ' - 'tuples (name, parameter), usually produced by ' - 'model.named_parameters().') - - dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters]) - if len(dups) > 0: - raise ValueError('Parameter names in named_parameters must be unique. 
' - 'Found duplicates: %s' % ', '.join(dups)) - - all_param_ids = {id(v) - for param_group in self.param_groups - for v in param_group['params']} - named_param_ids = {id(v) for k, v in named_parameters} - unnamed_param_ids = all_param_ids - named_param_ids - if len(unnamed_param_ids): - raise ValueError('named_parameters was specified, but one or more model ' - 'parameters were not named. Python object ids: ' - '%s' % ', '.join(str(id) for id in unnamed_param_ids)) - - self._parameter_names = {v: k for k, v in sorted(named_parameters)} - self.backward_passes_per_step = backward_passes_per_step - self._allreduce_delay = {v: self.backward_passes_per_step - for _, v in sorted(named_parameters)} - self.op = op - self.gradient_predivide_factor = gradient_predivide_factor - self.sparse_as_dense = sparse_as_dense - self.process_set = process_set - - self._handles = {} - self._grad_accs = [] - self._requires_update = set() - self._synchronized = False - self._should_synchronize = True - - if groups is not None: - if not (isinstance(groups, list) or groups > 0): - raise ValueError('groups should be a non-negative integer or ' - 'a list of list of torch.Tensor.') - if isinstance(groups, list): - grouped_parameter_ids = set() - for l in groups: - for p in l: - if not isinstance(p, torch.Tensor): - raise ValueError('groups must consist of torch.Tensor.') - if id(p) in grouped_parameter_ids: - raise ValueError('A parameter can only appear once in groups.') - grouped_parameter_ids.add(id(p)) - self._groups = groups - self._p_to_group = {} - self._group_counts = {} - - if self.process_set.included() and (size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1'): - self._register_hooks() - - def load_state_dict(self, *args, **kwargs): - self._handles = {} - self._synchronized = False - self._should_synchronize = True - for p in self._allreduce_delay: - self._allreduce_delay[p] = self.backward_passes_per_step - super(self.__class__, self).load_state_dict(*args, **kwargs) - - 
@staticmethod - def find_duplicates(lst): - seen = set() - dups = set() - for el in lst: - if el in seen: - dups.add(el) - seen.add(el) - return dups - - def set_backward_passes_per_step(self, passes): - self.backward_passes_per_step = passes - for p in self._allreduce_delay: - self._allreduce_delay[p] = self.backward_passes_per_step - - def _register_hooks(self): - if self._groups is not None: - p_list = [] - # Get list of parameters with grads - for param_group in self.param_groups: - for p in param_group['params']: - if p.requires_grad: - p_list.append(p) - - # To ensure parameter order and group formation is consistent, broadcast p_list order - # from rank 0 and use for every worker - p_list_names = [self._parameter_names.get(p) for p in p_list] - p_list_names = broadcast_object(p_list_names, root_rank=0, process_set=self.process_set) - p_list = sorted(p_list, key=lambda p: p_list_names.index(self._parameter_names.get(p))) - - # Form groups - if isinstance(self._groups, list): - p_groups = [] - grouped_id = set() - p_list_ids = [id(p) for p in p_list] - for group in self._groups: - p_groups.append([p for p in group if id(p) in p_list_ids]) - for p in p_groups[-1]: - grouped_id.add(id(p)) - for p in p_list: - if id(p) not in grouped_id: - p_groups.append([p]) - else: - p_groups = split_list(p_list, self._groups) - - p_groups = [tuple(p) for p in p_groups] - for group in p_groups: - for p in group: - self._p_to_group[p] = group - self._group_counts[group] = 0 - - for param_group in self.param_groups: - for p in param_group['params']: - if p.requires_grad: - p.grad = p.data.new(p.size()).zero_() - self._requires_update.add(p) - p_tmp = p.expand_as(p) - grad_acc = p_tmp.grad_fn.next_functions[0][0] - grad_acc.register_hook(self._make_hook(p)) - self._grad_accs.append(grad_acc) - - def _allreduce_grad_async(self, p): - if p.grad is None: - # Gradient was not computed, but we still need to submit a tensor to allreduce - # as one of the other ranks may have computed 
it (due to dynamic forward functions). - # - # NOTE: this will not work if the gradient is sparse and we perform an allgather. - # Unfrotunately, there doesn't appear to be a good way to detect that the parameter will - # produce sparse gradients before computing the gradient. - p.grad = p.data.new(p.size()).zero_() - - name = self._parameter_names.get(p) - tensor = p.grad - - if p.grad.is_sparse: - if self.sparse_as_dense: - tensor = tensor.to_dense() - else: - return self._sparse_allreduce_grad_async(p, name) - if self._grace and self._groups is None and self.op == Average: - handle, ctx = self._grace.send_step(tensor, name,self._process_set) - postscale_factor = self.gradient_predivide_factor - else: - - tensor_compressed, ctx = self._compression.compress(tensor) - - if self.op == Average: - # Split average operation across pre/postscale factors - # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. - prescale_factor = 1.0 / self.gradient_predivide_factor - postscale_factor = self.gradient_predivide_factor - else: - prescale_factor = 1.0 - postscale_factor = 1.0 - - handle = allreduce_async_(tensor_compressed, name=name, op=self.op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor, - process_set=self.process_set) - return handle, ctx - - def _grouped_allreduce_grad_async(self, ps): - name = self._parameter_names.get(ps[0]) - tensors_compressed, ctxs = zip(*[self._compression.compress(p.grad) for p in ps]) - - handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op, - process_set=self.process_set) - return handle, ctxs - - def _sparse_allreduce_grad_async(self, p, name): - handle = sparse_allreduce_async(p.grad, name=name, op=self.op, - process_set=self.process_set) - return handle, None - - def _make_hook(self, p): - def hook(*ignore): - if p in self._handles and self._handles[p][0] is not None: - if self._allreduce_delay[p] <= 0: - raise AssertionError( - "Gradients were computed 
more than " - "backward_passes_per_step times before call " - "to step(). Increase backward_passes_per_step to " - "accumulate gradients locally.") - assert not p.grad.requires_grad - assert self._allreduce_delay[p] > 0 - handle, ctx = None, None - self._allreduce_delay[p] -= 1 - if self._allreduce_delay[p] == 0: - if self._groups is not None: - group = self._p_to_group[p] - self._group_counts[group] += 1 - if self._group_counts[group] == len(group): - handle, ctxs = self._grouped_allreduce_grad_async(group) - self._handles[group] = (handle, ctxs) - # Remove any None entries from previous no-op hook calls - for gp in group: - self._handles.pop(gp, None) - self._group_counts[group] = 0 - return - else: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - return hook - - def synchronize(self): - if not self.process_set.included(): - self._synchronized = True - return - - completed = set() - for x in self._handles.keys(): - completed.update(x) if isinstance(x, tuple) else completed.add(x) - missing_p = self._requires_update - completed - for p in missing_p: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - - for p, (handle, ctx) in self._handles.items(): - if handle is None: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - for p, (handle, ctx) in self._handles.items(): - - if isinstance(p, tuple): - # This was a grouped result, need to unpack - outputs = synchronize(handle) - for gp, output, gctx in zip(p, outputs, ctx): - self._allreduce_delay[gp] = self.backward_passes_per_step - gp.grad.set_(self._compression.decompress(output, gctx)) - if self._groups is not None and self._group_counts[p] != 0: - self._group_counts[p] = 0 - else: - # When handle is a callable function, it returns the aggregated tensor result - if self._grace and self._groups is None and self.op == Average: - output = self._grace.receive_step(handle, ctx,self._process_set) - p.grad.set_(output) - else: 
- output = synchronize(handle) if not callable(handle) else handle() - self._allreduce_delay[p] = self.backward_passes_per_step - if self._groups is not None: - group = self._p_to_group[p] - if self._group_counts[group] != 0: - self._group_counts[group] = 0 - if p.grad.is_sparse: - aggregated = self._compression.decompress(output, ctx) - if not aggregated.is_sparse: - # When sparse_as_dense=True we need to convert the grad back to sparse before update - aggregated = aggregated.to_sparse() - - # Sparse grads do not support set_ for some reason, so we do this as an equivalent - p.grad.zero_().add_(aggregated) - p.grad.set_(self._compression.decompress(output, ctx)) - self._handles.clear() - - self._synchronized = True - - @contextmanager - def skip_synchronize(self): - """ - A context manager used to specify that optimizer.step() should - not perform synchronization. - - It's typically used in a following pattern: - - .. code-block:: python - - optimizer.synchronize() - with optimizer.skip_synchronize(): - optimizer.step() - """ - self._should_synchronize = False - try: - yield - finally: - self._should_synchronize = True - - def step(self, closure=None): - if self._should_synchronize: - if self._synchronized: - warnings.warn("optimizer.step() called without " - "optimizer.skip_synchronize() context after " - "optimizer.synchronize(). This can cause training " - "slowdown. You may want to consider using " - "optimizer.skip_synchronize() context if you use " - "optimizer.synchronize() in your code.") - self.synchronize() - self._synchronized = False - return super(self.__class__, self).step(closure) - - def zero_grad(self): - if self._handles: - raise AssertionError("optimizer.zero_grad() was called after loss.backward() " - "but before optimizer.step() or optimizer.synchronize(). 
" - "This is prohibited as it can cause a race condition.") - return super(self.__class__, self).zero_grad() - - -class _DistributedAdasumOptimizer(torch.optim.Optimizer): - def __init__(self, params, named_parameters, compression, - backward_passes_per_step=1): - super(self.__class__, self).__init__(params) - - self._compression = compression - - if named_parameters is not None: - named_parameters = list(named_parameters) - else: - named_parameters = [('allreduce.noname.%s' % i, v) - for param_group in self.param_groups - for i, v in enumerate(param_group['params'])] - - # make sure that named_parameters are tuples - if any([not isinstance(p, tuple) for p in named_parameters]): - raise ValueError('named_parameters should be a sequence of ' - 'tuples (name, parameter), usually produced by ' - 'model.named_parameters().') - - dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters]) - if len(dups) > 0: - raise ValueError('Parameter names in named_parameters must be unique. ' - 'Found duplicates: %s' % ', '.join(dups)) - - all_param_ids = {id(v) - for param_group in self.param_groups - for v in param_group['params']} - named_param_ids = {id(v) for k, v in named_parameters} - unnamed_param_ids = all_param_ids - named_param_ids - if len(unnamed_param_ids): - raise ValueError('named_parameters was specified, but one or more model ' - 'parameters were not named. 
Python object ids: ' - '%s' % ', '.join(str(id) for id in unnamed_param_ids)) - - self._parameter_names = {v: k for k, v in sorted(named_parameters)} - self.backward_passes_per_step = backward_passes_per_step - self._allreduce_delay = {v: self.backward_passes_per_step - for _, v in sorted(named_parameters)} - self._handles = {} - self._grad_accs = [] - self._requires_update = set() - self._synchronized = False - self._should_synchronize = True - - self._starting_models = { - p : torch.zeros_like(p, requires_grad=False) - for _, p in named_parameters - } - - self._register_hooks() - - def set_backward_passes_per_step(self, passes): - self.backward_passes_per_step = passes - for p in self._allreduce_delay: - self._allreduce_delay[p] = self.backward_passes_per_step - - def _register_hooks(self): - for param_group in self.param_groups: - for p in param_group['params']: - if p.requires_grad: - p.grad = p.data.new(p.size()).zero_() - self._requires_update.add(p) - p_tmp = p.expand_as(p) - grad_acc = p_tmp.grad_fn.next_functions[0][0] - grad_acc.register_hook(self._make_hook(p)) - self._grad_accs.append(grad_acc) - - def _allreduce_grad_async(self, p): - # Delta optimizer implements this logic: - # start = current.copy() - # step() -> computes 'current - \alpha.f(g)' where f is - # optimizer logic and g is the gradient - # delta = current-start - # allreduce_(delta) - # start += delta - # current = start - # In order to suppport this logic using function hook to improve performance, - # we do: - # delta = (start - \alpha.f(g)) - start - # = -\alpha.f(g) - # set start to zero and step computes -\alpha.f(g) - # where f is the underlying optimizer logic - - name = self._parameter_names.get(p) - start = self._starting_models[p] - - stashed_params = [] - for group in self.param_groups: - stashed_params.append(group['params']) - # only want to step on p - if any([p is v for v in group['params']]): - group['params'] = [p] - else: - group['params'] = [] - - start.data.copy_(p) - 
- super(self.__class__, self).step() - - # compute delta = curr - start - p.data.sub_(start) - - # allreduce as before - tensor_compressed, ctx = self._compression.compress(p) - handle = allreduce_async_(tensor_compressed.data, name=name, op=Adasum) - - # reset stashed parameters - for stashed, group in zip(stashed_params, self.param_groups): - group['params'] = stashed - - return handle, ctx - - def _make_hook(self, p): - def hook(*ignore): - if p in self._handles and self._handles[p][0] is not None: - if self._allreduce_delay[p] <= 0: - raise AssertionError( - "Gradients were computed more than " - "backward_passes_per_step times before call " - "to step(). Increase backward_passes_per_step to " - "accumulate gradients locally.") - assert not p.grad.requires_grad - assert self._allreduce_delay[p] > 0 - handle, ctx = None, None - self._allreduce_delay[p] -= 1 - if self._allreduce_delay[p] == 0: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - return hook - - def synchronize(self): - pass - - @contextmanager - def skip_synchronize(self): - raise AssertionError("Skipping synchronization is not supported when using Adasum optimizer.") - - def step(self, closure=None): - loss = None - if closure is not None: - loss = closure() - - missing_p = self._requires_update - set(self._handles.keys()) - for p in missing_p: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - - for p, (handle, ctx) in self._handles.items(): - # This means step() is called before backward_passes_per_steps finished. - # We do a synchoronous allreduce here. 
- if not handle: - handle, ctx = self._allreduce_grad_async(p) - self._handles[p] = (handle, ctx) - delta = synchronize(handle) - delta = self._compression.decompress(delta, ctx) - start = self._starting_models[p] - start.data.add_(delta.data) - p.data.copy_(start) - self._allreduce_delay[p] = self.backward_passes_per_step - self._handles.clear() - return loss - - def zero_grad(self): - if self._handles: - raise AssertionError("optimizer.zero_grad() was called after loss.backward() " - "but before optimizer.step() or optimizer.synchronize(). " - "This is prohibited as it can cause a race condition.") - return super(self.__class__, self).zero_grad() - - -def DistributedOptimizer(optimizer, named_parameters=None,grace=None, - compression=Compression.none, - backward_passes_per_step=1, - op=Average, - gradient_predivide_factor=1.0, - num_groups=0, groups=None, - sparse_as_dense=False, - process_set=global_process_set): - """ - An optimizer that wraps another torch.optim.Optimizer, using an allreduce to - combine gradient values before applying gradients to model weights. - - Allreduce operations are executed after each gradient is computed by ``loss.backward()`` - in parallel with each other. The ``step()`` method ensures that all allreduce operations are - finished before applying gradients to the model. - - DistributedOptimizer exposes the ``synchronize()`` method, which forces allreduce operations - to finish before continuing the execution. It's useful in conjunction with gradient - clipping, or other operations that modify gradients in place before ``step()`` is executed. - Make sure to use ``optimizer.skip_synchronize()`` if you're calling ``synchronize()`` - in your code. - - Example of gradient clipping: - - .. 
code-block:: python - - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.synchronize() - torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - with optimizer.skip_synchronize(): - optimizer.step() - - Arguments: - optimizer: Optimizer to use for computing gradients and applying updates. - named_parameters: A mapping between parameter names and values. Used for naming of - allreduce operations. Typically just ``model.named_parameters()``. - compression: Compression algorithm used during allreduce to reduce the amount - of data sent during the each parameter update step. Defaults to - not using compression. - backward_passes_per_step: Number of expected backward passes to perform - before calling step()/synchronize(). This - allows accumulating gradients over multiple - mini-batches before reducing and applying them. - op: The reduction operation to use when combining gradients across different ranks. - gradient_predivide_factor: If op == Average, gradient_predivide_factor splits the averaging - before and after the sum. Gradients are scaled by - 1.0 / gradient_predivide_factor before the sum and - gradient_predivide_factor / size after the sum. - num_groups: Number of groups to assign gradient allreduce ops to for explicit - grouping. Defaults to no explicit groups. - groups: The parameter to group the gradient allreduce ops. Accept values is a - non-negative integer or a list of list of torch.Tensor. - If groups is a non-negative integer, it is the number of groups to assign - gradient allreduce ops to for explicit grouping. - If groups is a list of list of torch.Tensor. Tensors in the same - inner list will be assigned to the same group, while parameter that does - not appear in any list will form a group itself. - Defaults as None, which is no explicit groups. - sparse_as_dense: If set True, convert all sparse gradients to dense and perform allreduce, then - convert back to sparse before applying the update. 
- process_set: Gradients will only be reduced over Horovod processes belonging - to this process set. Defaults to the global process set. - """ - # We dynamically create a new class that inherits from the optimizer that was passed in. - # The goal is to override the `step()` method with an allreduce implementation. - if gradient_predivide_factor != 1.0: - if rocm_built(): - raise ValueError('gradient_predivide_factor not supported yet with ROCm') - if op != Average: - raise ValueError('gradient_predivide_factor not supported with op != Average') - - if num_groups != 0: - warnings.warn('Parameter `num_groups` has been replaced by `groups` ' - 'and will be removed in v0.23.0.', DeprecationWarning) - if groups is None: - groups = num_groups - - if backward_passes_per_step <= 0: - raise ValueError("backward_passes_per_step must be > 0") - - if op != Adasum or size() == 1: - cls = type(optimizer.__class__.__name__, (optimizer.__class__,), - dict(_DistributedOptimizer.__dict__)) - return cls(optimizer.param_groups, named_parameters,grace, compression, backward_passes_per_step, op, - gradient_predivide_factor, groups, sparse_as_dense, process_set) - else: - if process_set != global_process_set: - raise NotImplementedError("Adasum does not support non-global process sets yet.") - cls = type(optimizer.__class__.__name__, (optimizer.__class__,), - dict(_DistributedAdasumOptimizer.__dict__)) - return cls(optimizer.param_groups, named_parameters, compression, backward_passes_per_step) From 53462cc23cd825bc63179aa304995571379d87fa Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 00:55:44 -0400 Subject: [PATCH 10/44] HOROVOD new version compatible for pytorch --- grace_dl/.DS_Store | Bin 0 -> 6148 bytes grace_dl/dist/.DS_Store | Bin 0 -> 6148 bytes grace_dl/tensorflow/.DS_Store | Bin 0 -> 8196 bytes grace_dl/tensorflow/communicator/.DS_Store | Bin 0 -> 6148 bytes grace_dl/tensorflow/compressor/.DS_Store | Bin 0 -> 6148 bytes grace_dl/tensorflow/memory/.DS_Store 
| Bin 0 -> 6148 bytes grace_dl/torch/.DS_Store | Bin 0 -> 8196 bytes grace_dl/torch/communicator/.DS_Store | Bin 0 -> 6148 bytes grace_dl/torch/compressor/.DS_Store | Bin 0 -> 6148 bytes grace_dl/torch/memory/.DS_Store | Bin 0 -> 6148 bytes patch_files/.DS_Store | Bin 0 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 0 -> 6148 bytes patch_files/horovod/torch/.DS_Store | Bin 0 -> 6148 bytes 13 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 grace_dl/.DS_Store create mode 100644 grace_dl/dist/.DS_Store create mode 100644 grace_dl/tensorflow/.DS_Store create mode 100644 grace_dl/tensorflow/communicator/.DS_Store create mode 100644 grace_dl/tensorflow/compressor/.DS_Store create mode 100644 grace_dl/tensorflow/memory/.DS_Store create mode 100644 grace_dl/torch/.DS_Store create mode 100644 grace_dl/torch/communicator/.DS_Store create mode 100644 grace_dl/torch/compressor/.DS_Store create mode 100644 grace_dl/torch/memory/.DS_Store create mode 100644 patch_files/.DS_Store create mode 100644 patch_files/horovod/.DS_Store create mode 100644 patch_files/horovod/torch/.DS_Store diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f530e2364144b4e446d40dbda61eafc6e2c1a79a GIT binary patch literal 6148 zcmeHK&2AGh5FV!~-LwKC0fJnRQLjNHZAw*fSwgsQLM(y4lxB|UNmkj0Y2$ z5C#Jbd3ZmJ1Ch1GG!7D(>nVp*aVp(vZ8mFdx9YsPyFIV-S+lXbRp+g}o%y`toZGx| z?N0YuG>XLw`t*1Z9v*%tRsX1diLJuTnXDHE

11-z=NRRO3VX}ndjzqkVej;HNl_U-OF#OZ{UEbNN^R27RU)9gplBt?8oYM+kUCQ%d zD#ULcB?nf^`#@-8*~O?5VwMp%bsz%uZ^GC=18$3p08Tqu-B z2UhY4fLO+1Y1pQ>1m$pzuEvEz^q?`7il|bVequ0{j^kX%xf&M=RXQ;J_+WZvre7#b zu8#9_84k==J#@zzUCXgN s!9qrH3xzTTD}5cy23^G$ut-B6rv{>{aiI`L(Ci-pNrSB{1HYAluVJbE9{>OV literal 0 HcmV?d00001 diff --git a/grace_dl/dist/.DS_Store b/grace_dl/dist/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0e5427f8fe4b097ce86acb1b96e871829d3c7ce3 GIT binary patch literal 6148 zcmeHKyG{c^3>-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/.DS_Store b/grace_dl/tensorflow/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..354ac1b19a8cd736f5444b51fb428fc7b68e8ed2 GIT binary patch literal 8196 zcmeHM%Wl&^6ur}i#%U?aqLtbeme@ujp+Oa5W74ok7{LNiuw$#WusxRTG^(OV-qYXU z2M`;!EcgWefju9I?=FLYxb z;5jb3ynEw?4xJ9@ByT8dIvh&IG&psC z-P`~1Y_-30CI9Fozm_grQti4=%{cUFc#J-boK2~61Tnaf;BkW#uH0N+-Aad|vR^jq zSjwx~{Jc=La?F5UD*89yf|`VX!79frDY=1Hs*4r%gR`4*jTSbQSFT_vDz@_-6)V{D zO8x%2P^#o}sZgCyK^b^~(-h|^DZ=ckCiN$_P|dAl;1AJaWw!tT literal 0 HcmV?d00001 diff --git a/grace_dl/tensorflow/communicator/.DS_Store b/grace_dl/tensorflow/communicator/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 
zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0fgc5S?v9>(Bx@v{HMs#5IVdqD2)KH-tOF2o8XP9b2u1?X}{BsEQ)_4u6O% z2Yv!vIl?bM+?d(jrgmZn^@3=-W9>d?cV<7{i@llz09a?_wE=bjz`#P-*ubj9sGrJ$ zt*Dl5M27JIBZwh@J~%L3@YW5Q0nLDBKr^5j&lQ}vRBud0nNa*WPsie zCKf_VVJ1<$IR58IJuXxg%N%Gl?=B zm<$J|W@aiwVPbZSGle@aOQNnd1Db(F1}Jym#EhLogjYU)=P+=`Y2t?5MW0V@5(eq8 z)%wa-R%fxcvGth8Lnny+fd~jZ zA6;I*3}TOuJA4#-sfe{@hfz1`y~gfja&P~zY3?1gr%iLxZav&tgDzPE*rqGFBB`To38 z!S9(;g)t(2u27XvQJJTtOks>UMW|n!X5ex$uqv0kL;3%c>fiq_*QWGpH3OP~3o*cI zUAx=C5ISzljB;%g%UdjDh_gQo@!mvNim?=C60rsCpZ^eWs?UEx KysoY_1AhR)|7g?z literal 0 HcmV?d00001 diff --git a/grace_dl/torch/communicator/.DS_Store b/grace_dl/torch/communicator/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0S5Z-O8O(;SS3VK`cTCpuw5icRu7cim+m717hgE3p0wmFnS?)pN$h|lB9 z?nboLlOR%NVD{V0&nEkA+37IGxH}13jMW)q7ARuLhT$7QKkAZ{jHL|7)f!VKSr}yL zFo>6;$?zW;z_Y8e0i4?i^wast06ee3Fpko3v-!qL<;vQ6)v8)G>((2{%p3dTN$UEe zYkazpG7gHlA6!QLtXJPWmC4wTlKw~~L}4F7Zmyyvl$k3hNtmi!PdhBzvU~NN*=(#g?Ar6i^^SNzp?e3qP_a5V?L_TXSIsELDY#1!y1ttsI3e)>IfX~E~ z1=rLYC9zB%z*8((EJR2Q5Cg=(Dl%a1I;*ydo1;Au1H{0u7{L2MfFe2ubB*fifI+nY zfE6$+fjRaPm?I2226K(z0pU6oP^WU!VsM=fc46WigSkeX&bVnlxOp=-9ST=(hxLUD zXWY?9BQZb>d}RQ8Kae`s|FfU_e-%UnF+dEgCIh_E^|~(X&D7SVU1F^jpm(4s7?*4O lNP&qe#Sn|7cpFp+*ac1i9fP?>uz=7X0Yw81#K502@D6?|Q(6E3 literal 0 HcmV?d00001 diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store new file mode 100644 index 
0000000000000000000000000000000000000000..dc24be0dfc0404f529289b7176962ea231d27203 GIT binary patch literal 6148 zcmeHK%}&BV5Z(opB}T(R6G=R5;uRwpe?q)e#22te4{DH7l*ZB}w8(*w^sZ;$#TW1q zd+(xSXAMfhz`8(GT2E#oBs+mo0)+a|hqE<8CF}q*6d|oI#?!EVLzwqd`6B+bk6ZvoKUf_%1 z3T7Q|L@xzfWB?gJ2ELyGbMCOY?@xspkO5@ik1-(62M#LHG8k)AM+Y>j1OUu{Sqb!I zEg@sLLCavQ5k^3$4h7Vq)RY)hhl8CPKg(dOQHK+1iVtcsQ&XW(l^y2i%AHV4Beuu@ zGBC+NT0iFH{eSRz{XZFmPsji=@UIx4nVMaz!j{b4I<+}@*K*JcP$|+c*7z0z9d#9h fue^$vL6v}=%MPGrFxChb5d0BPG+=`a{3rvjCL?VB literal 0 HcmV?d00001 diff --git a/patch_files/horovod/torch/.DS_Store b/patch_files/horovod/torch/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 00:59:37 -0400 Subject: [PATCH 11/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes grace_dl/.DS_Store | Bin 6148 -> 6148 bytes grace_dl/tensorflow/.DS_Store | Bin 8196 -> 8196 bytes grace_dl/torch/.DS_Store | Bin 8196 -> 8196 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/.DS_Store b/.DS_Store index bad07e965141e67ad73cab84e3ab9c0ed6fb1380..830ec02fc267174df5e1eb296292f2654cb75bd4 100644 GIT binary patch delta 33 kcmZp1XmQx!FT`YKI5|)#j*)Tm2BBa^2+wu1m&h(|0H<6DoB#j- delta 33 kcmZp1XmQx!FT`YOIXO@$j*(&W2BBa^2+wu1m&h(|0H`(zsQ>@~ diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store index f530e2364144b4e446d40dbda61eafc6e2c1a79a..7c72073262c1b15b1b9e63219482b21ec115091a 100644 GIT binary patch delta 40 vcmZoMXfc?Ouvw5}HS=U{7Ewlq&5A6gjEoGE_p?fDUe7ug$lUD9zFhzS>f{R@ delta 37 scmZoMXfc?Oz{t2UaX&L7<7P#cQbtC`$@^I)Hm_%$3uJEgW#29U0Mhab#Q*>R diff --git a/grace_dl/tensorflow/.DS_Store b/grace_dl/tensorflow/.DS_Store index 354ac1b19a8cd736f5444b51fb428fc7b68e8ed2..1cafe259310a11e1a521db1a2a204070823cadc2 100644 GIT 
binary patch delta 14 VcmZp1XmQxEQiPFl^D2=jZU8721kL~e delta 14 VcmZp1XmQxEQiPFV^D2=jZU86{1kC^d diff --git a/grace_dl/torch/.DS_Store b/grace_dl/torch/.DS_Store index 2715dccc4ff900755cb3c6b8d258e8090206336a..1b1428a405518204e00c4423c42b291248b8985c 100644 GIT binary patch delta 14 VcmZp1XmQxEQiPFV^D2=jZU86{1kC^d delta 14 VcmZp1XmQxEQiPFl^D2=jZU8721kL~e From d1f3b62921e58d65d7aa2f1f51ee0772f6576151 Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Mon, 22 Jul 2024 01:03:29 -0400 Subject: [PATCH 12/44] HOROVOD new version compatible for pytorch --- .DS_Store | Bin 8196 -> 8196 bytes README.md | 68 ++- examples/.DS_Store | Bin 0 -> 6148 bytes grace_dl/.DS_Store | Bin 6148 -> 6148 bytes grace_dl/torch/.DS_Store | Bin 8196 -> 8196 bytes grace_dl/torch/communicator/allgather.py | 15 +- grace_dl/torch/compressor/topk.py | 5 +- grace_dl/torch/memory/residual.py | 5 +- patch_files/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/.DS_Store | Bin 6148 -> 6148 bytes patch_files/horovod/torch/__init__.py | 57 +- patch_files/horovod/torch/mpi_ops.py | 712 +++++++++++++++++++---- patch_files/horovod/torch/optimizer.py | 173 ++++-- 13 files changed, 842 insertions(+), 193 deletions(-) create mode 100644 examples/.DS_Store diff --git a/.DS_Store b/.DS_Store index 830ec02fc267174df5e1eb296292f2654cb75bd4..5043af2638c57c7830bb83a3d0f38dce8dfe20ca 100644 GIT binary patch delta 23 fcmZp1XmQxEKwz?;kk{r{0!JAc88&Ya3gHC+ZeIv+ delta 32 ocmZp1XmQxEK!C%<*hELc(A;=3mx#mW7Xrr_85uWk5DMW10HfszuK)l5 diff --git a/README.md b/README.md index feaed8f..a777d33 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,5 @@ # GRACE - GRAdient ComprEssion for distributed deep learning -## Updated GRACE for new version of Horovod in Pytorch -Updated Patch files - - - ## Context Powerful computer clusters are used nowadays to train complex deep neural networks (DNN) on large datasets. Distributed training workloads increasingly become communication bound. 
For this reason, many lossy compression techniques have been proposed to reduce the volume of transferred data, and consequently, accelerate training. @@ -14,3 +9,66 @@ It provides a general framework and an API for implementing gradient compression GRACE includes the implementation of popular compression algorithms surveyed in [GRACE: A Compressed Communication Framework for Distributed Machine Learning](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf). GRACE supports both TensorFlow and Pytorch, and offers easy integration with training scripts. This simplifies the process of implemeting and comparing across compression methods. +![grace_bert_bench](https://user-images.githubusercontent.com/52653060/174394060-8f616ed0-0fca-424d-ab9f-2648423d11c2.jpg) + + +## Install +The `grace_dl` package can be install locally on your nodes. For Horovod support, it is required to apply +the patch files provided. You can find more information [here](INSTALLING.md). + +## Concepts +The three main components of this framework are the `Communicator`, `Compressor`, and `Memory` abstract classes. +- `Communicator` implementations define the communication method used by GRACE. +- `Compressor` implementations provide the `compress` and `decompress` methods. +- `Memory` implementations provide the `update` and `compensate` methods. + +## Supported frameworks +Currently the supported frameworks are: +- Horovod 0.21.0 ([TensorFlow 1.15](grace_dl/tensorflow)) +- Horovod 0.21.0 ([PyTorch 1.7.0](grace_dl/torch)) +- DistributedDataParallel (DDP) ([PyTorch >= 1.4.0](grace_dl/dist)) + +## Usage +To use GRACE: +- **As a User**: you need to change a few lines of code on your training script. Refer [here](TRAINING.md) for examples. +- **As an Implementer of new Compressors**: you need to implement a `Compressor` class, and optionally a `Memory` or even a `Communicator` class. +Refer [here](IMPLEMENTING.md) for more information. 
+- **For Benchmarking methods**: you can use the [training scripts](https://github.com/sands-lab/grace-benchmarks) and compare with [our data](https://github.com/sands-lab/grace-data). + +## Running +Running a training script is no different with GRACE. See the Training guide for more information. + +## Guides +- [Installing GRACE](INSTALLING.md) +- [Implementing GRACE components](IMPLEMENTING.md) +- [Training with GRACE](TRAINING.md) + +## Citation + +If you find this useful, please cite our work as: + +Hang Xu, Chen-Yu Ho, Ahmed M Abdelmoniem, Aritra Dutta, El Houcine Bergou, Konstantinos Karatsenidis, Marco Canini, and Panos Kalnis. GRACE: A Compressed Communication Framework for Distributed Machine Learning. In Proc. of ICDCS, 2021. + +Here's a BibTeX blurb: + +``` +@inproceedings{xu2021grace, + title={GRACE: A compressed communication framework for distributed machine learning}, + author={Xu, Hang and Ho, Chen-Yu and Abdelmoniem, Ahmed M. and Dutta, Aritra and Bergou, EH and Karatsenidis, Konstantinos and Canini, Marco and Kalnis, Panos}, + booktitle={Proc. of 41st IEEE Int. Conf. Distributed Computing Systems (ICDCS)}, + year={2021} +} +``` + +## Publications + +* [GRACE: A Compressed Communication Framework for Distributed Machine Learning](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf).
+ H. Xu, C.-Y. Ho, A. M. Abdelmoniem, A. Dutta, E. H. Bergou, K. Karatsenidis, M. Canini, P. Kalnis.
+ In Proc. of ICDCS, 2021. + +## Mailing lists + +There is no official mailing list set up yet. Feel free to reach out by email to any of the authors indicated in the [paper](https://sands.kaust.edu.sa/papers/grace.icdcs21.pdf). + +## License +See [LICENSE](LICENSE) for the details. diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..4fd8dd2eaf137c68d1dd38c85df602c3ff83ce22 GIT binary patch literal 6148 zcmeHK%}T>S5Z<+|O({YS3VK`cTCpiw5icRu7cim+m73UMgE3pu)Er77cYPsW#OHBl zcLOaJk0N#kcE8#A+0A^A{b7u8Z#r-ovl(L+G(?Wd3PJNq*MsH-rTKE1)=6)7r(}@?Pw>Y|zG7d{U z2(P1I-nX~UWs(I^GE7xMG#En2-A$AXWbVmnGMK1bUpuUZ)#%%Mv)Mt{X^BqPnYYBu zX`ggj;^_2vK5tk%`-f+j{pa{4k#Cww4wNg|GFZYpC<`UM`e_o&=NM(}`e zoeHQ^xp`u6oep+k;#`A;MxD;MS{ddsE0>QKu2u)TP~nWb8mT7+h=FAWDtc(+`F{bw z%*sdpate)z0b<~vF~F-mzvn?w=4|~|9-g%V+5?KaI(LM>*fyvvl*f6$)ANpH$N9j1MxTei)`fv07{t-hyVZp delta 43 scmZp1XmQxEQh>?Sa*fyvvl*f6$)ANpH$N9j1MxTei)`fv08B^@od5s; diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py index a598475..ff95a6c 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -1,6 +1,6 @@ import torch -from grace_dl.torch import Communicator +from grace.grace_dl.torch import Communicator from horovod.torch import allgather, allgather_async, synchronize @@ -9,7 +9,8 @@ def __init__(self, compressor, memory, world_size): super().__init__(compressor, memory) self.world_size = world_size - def async_send(self, tensors_compressed, name): + + def async_send(self, tensors_compressed, name,process_set): """ :param tensors_compressed: list of flat tensors to communicate :param name: for the all_gather operation @@ -26,21 +27,25 @@ def async_send(self, tensors_compressed, name): tensor_sizes = zip(*tensors_size_ag) # transpose else: tensors_size = torch.tensor(tensors_size) # TODO: set device - gathered = allgather(tensors_size) # tensor of tensor sizes per rank + gathered = 
allgather(tensor=tensors_size,process_set=process_set) # tensor of tensor sizes per rank tensor_sizes = gathered.view([self.world_size, -1]).t().tolist() # transpose, to list handles = [] for tensor_compressed in tensors_compressed: - handle = allgather_async(tensor_compressed) + handle = allgather_async(tensor=tensor_compressed,process_set=process_set) handles.append(handle) return handles, tensor_sizes - def wait_receive(self, result, ctx): + def wait_receive(self, result, ctx,process_set): handles, tensor_sizes = result + # print("Result",result) tensors_ag = [] for handle, sizes in zip(handles, tensor_sizes): gathered = synchronize(handle) + # print("gathered",gathered) + # print("gathered",len(gathered)) + # print("sizes",sizes) tensors_ag.append(gathered.split(sizes)) # import time # time.sleep(0.001) diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 6d2a750..9ccfc2d 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -1,6 +1,6 @@ import torch -from grace_dl.torch import Compressor +from grace.grace_dl.torch import Compressor def sparsify(tensor, compress_ratio): @@ -13,6 +13,7 @@ def sparsify(tensor, compress_ratio): def desparsify(tensors, numel): values, indices = tensors + # print("values, indices",values, indices,numel) tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) tensor_decompressed.scatter_(0, indices, values) return tensor_decompressed @@ -21,7 +22,7 @@ def desparsify(tensors, numel): class TopKCompressor(Compressor): def __init__(self, compress_ratio): - super().__init__() + super().__init__()#tensors_size_are_same=False self.compress_ratio = compress_ratio def compress(self, tensor, name): diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index a2ae61f..6054d25 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -1,4 +1,4 @@ -from grace_dl.torch 
import Memory +from grace.grace_dl.torch import Memory class ResidualMemory(Memory): @@ -9,12 +9,15 @@ def __init__(self, beta=1.0, gamma=1.0): def compensate(self, tensor, name): """Update the tensor with the residuals.""" + # print("inside residuals compensate") if name in self.residuals: tensor = self.beta * self.residuals[name] + self.gamma * tensor + # print("inside residuals",self.residuals[name] ) return tensor def update(self, tensor, name, compressor, tensor_compressed, ctx): """Update the residuals.""" tensor_decompressed = compressor.decompress(tensor_compressed, ctx) residual = tensor - tensor_decompressed + # print("residual update",residual) self.residuals[name] = residual diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store index 3a3d06f26a7ed932239fa6dfc74135e387e705ff..6c2a8c29066cc5b66b19eb9f8e0b1de8b7f7ab1d 100644 GIT binary patch delta 15 WcmZoMXffC@pNYxLaPvZ@WKjSr%LMTN delta 15 WcmZoMXffC@pNYxTa`Qr_WKjSr^aS_- diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store index dc24be0dfc0404f529289b7176962ea231d27203..43d7721ef476f915abed16f57aeeddb43526b4df 100644 GIT binary patch delta 139 zcmZoMXfc@JFUrNhz`)4BAi%(o$WWY8oSc)CpP#dNBJ*;_AdnOrLp(z^Ln=cNvNTYf zffc(j7efh9bsj@8Lq1SX8bc0{E{7S*Fqwy`pUKQ{@?53}MuyGbm=3UPX6N|J4*>Um BBQpR1 delta 59 zcmZoMXfc@JFU-!sz`)4BAi%(okyKt>kd%|3wD}?PN=8QZ$+1lROs1BT-!VlnGH!Nf PKEN`uA#gK0$6tN`)DRIq diff --git a/patch_files/horovod/torch/__init__.py b/patch_files/horovod/torch/__init__.py index d17c61d..7ad827e 100644 --- a/patch_files/horovod/torch/__init__.py +++ b/patch_files/horovod/torch/__init__.py @@ -16,33 +16,44 @@ from horovod.common.util import check_extension +_MPI_LIB_AVAILABLE = True try: check_extension('horovod.torch', 'HOROVOD_WITH_PYTORCH', __file__, 'mpi_lib_v2') -except: - check_extension('horovod.torch', 'HOROVOD_WITH_PYTORCH', - __file__, 'mpi_lib', '_mpi_lib') - -from horovod.torch import elastic -from horovod.torch.compression import Compression -from horovod.torch.functions 
import allgather_object, broadcast_object, broadcast_optimizer_state, broadcast_parameters -from horovod.torch.mpi_ops import allreduce, allreduce_async, allreduce_, allreduce_async_ -from horovod.torch.mpi_ops import grouped_allreduce, grouped_allreduce_async, grouped_allreduce_, grouped_allreduce_async_ -from horovod.torch.mpi_ops import allgather, allgather_async -from horovod.torch.mpi_ops import broadcast, broadcast_async, broadcast_, broadcast_async_ -from horovod.torch.mpi_ops import alltoall, alltoall_async -from horovod.torch.mpi_ops import join -from horovod.torch.mpi_ops import poll, synchronize -from horovod.torch.mpi_ops import init, shutdown -from horovod.torch.mpi_ops import is_initialized, start_timeline, stop_timeline -from horovod.torch.mpi_ops import size, local_size, rank, local_rank -from horovod.torch.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built -from horovod.torch.mpi_ops import gloo_enabled, gloo_built -from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built -from horovod.torch.mpi_ops import Average, Sum, Adasum -from horovod.torch.optimizer import DistributedOptimizer -from horovod.torch.sync_batch_norm import SyncBatchNorm +except Exception as e: + # MPI libs are missing, but python applications are still available. + print(e) + print("Warning! MPI libs are missing, but python applications are still available.") + _MPI_LIB_AVAILABLE = False +# only import following function when mpi is available. 
+if _MPI_LIB_AVAILABLE: + from horovod.torch import elastic + from horovod.torch.compression import Compression + from horovod.torch.functions import allgather_object, broadcast_object, broadcast_optimizer_state, broadcast_parameters + from horovod.torch.mpi_ops import allreduce, allreduce_async, allreduce_, allreduce_async_ + from horovod.torch.mpi_ops import grouped_allreduce, grouped_allreduce_async, grouped_allreduce_, grouped_allreduce_async_ + from horovod.torch.mpi_ops import sparse_allreduce_async + from horovod.torch.mpi_ops import allgather, allgather_async + from horovod.torch.mpi_ops import grouped_allgather, grouped_allgather_async + from horovod.torch.mpi_ops import broadcast, broadcast_async, broadcast_, broadcast_async_ + from horovod.torch.mpi_ops import alltoall, alltoall_async + from horovod.torch.mpi_ops import reducescatter, reducescatter_async + from horovod.torch.mpi_ops import grouped_reducescatter, grouped_reducescatter_async + from horovod.torch.mpi_ops import join + from horovod.torch.mpi_ops import barrier + from horovod.torch.mpi_ops import poll, synchronize + from horovod.torch.mpi_ops import init, shutdown + from horovod.torch.mpi_ops import is_initialized, start_timeline, stop_timeline + from horovod.torch.mpi_ops import size, local_size, cross_size, rank, local_rank, cross_rank + from horovod.torch.mpi_ops import mpi_threads_supported, mpi_enabled, mpi_built + from horovod.torch.mpi_ops import gloo_enabled, gloo_built + from horovod.torch.mpi_ops import nccl_built, ddl_built, ccl_built, cuda_built, rocm_built + from horovod.torch.mpi_ops import ProcessSet, global_process_set, add_process_set, remove_process_set + from horovod.torch.mpi_ops import Average, Sum, Adasum, Min, Max, Product + from horovod.torch.mpi_ops import HorovodInternalError + from horovod.torch.optimizer import DistributedOptimizer + from horovod.torch.sync_batch_norm import SyncBatchNorm # Please run this function in a subprocess def _check_has_gpu(): diff --git 
a/patch_files/horovod/torch/mpi_ops.py b/patch_files/horovod/torch/mpi_ops.py index a4c16f7..8ee3474 100644 --- a/patch_files/horovod/torch/mpi_ops.py +++ b/patch_files/horovod/torch/mpi_ops.py @@ -20,25 +20,37 @@ import warnings -from horovod.torch import mpi_lib_v2 as mpi_lib from horovod.common.basics import HorovodBasics as _HorovodBasics -_NULL = "" -_basics = _HorovodBasics(__file__, 'mpi_lib_v2') - from horovod.common.exceptions import HorovodInternalError -from horovod.common.util import get_average_backwards_compatibility_fun, gpu_available, num_rank_is_power_2 +from horovod.common.process_sets import _setup as _setup_process_sets +from horovod.common.process_sets import ProcessSet, global_process_set, add_process_set, remove_process_set +from horovod.common.util import check_installed_version, get_average_backwards_compatibility_fun, gpu_available, num_rank_is_power_2 from horovod.torch.compression import Compression +# Check possible symbol not found error from pytorch version mismatch +try: + from horovod.torch import mpi_lib_v2 as mpi_lib +except Exception as e: + check_installed_version('pytorch', torch.__version__, e) + raise e +else: + check_installed_version('pytorch', torch.__version__) + +_NULL = "" + +_basics = _HorovodBasics(__file__, 'mpi_lib_v2') + # import basic methods -init = _basics.init is_initialized = _basics.is_initialized start_timeline = _basics.start_timeline stop_timeline = _basics.stop_timeline size = _basics.size local_size = _basics.local_size +cross_size = _basics.cross_size rank = _basics.rank local_rank = _basics.local_rank +cross_rank = _basics.cross_rank mpi_threads_supported = _basics.mpi_threads_supported mpi_enabled = _basics.mpi_enabled mpi_built = _basics.mpi_built @@ -49,19 +61,32 @@ ccl_built = _basics.ccl_built cuda_built = _basics.cuda_built rocm_built = _basics.rocm_built + def shutdown(*args, **kwargs): mpi_lib.horovod_torch_reset() return _basics.shutdown(*args, **kwargs) +def init(*args, **kwargs): + global 
_handle_map + _handle_map = {} + _basics.init(*args, **kwargs) + # Call set up again to make sure the basics is in sync + _setup_process_sets(_basics) + # import reduction op values Average = _basics.Average Sum = _basics.Sum Adasum = _basics.Adasum +Min = _basics.Min +Max = _basics.Max +Product = _basics.Product is_homogeneous = _basics.is_homogeneous handle_average_backwards_compatibility = get_average_backwards_compatibility_fun(_basics) +_setup_process_sets(_basics) + # Schema: handle -> input, output # We keep input in order to make sure it does not get garbage collected @@ -82,17 +107,19 @@ def _allreduce_function_factory(tensor): return 'horovod_torch_allreduce_async_' + tensor.type().replace('.', '_') -def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor): +def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform averaging at framework level - divisor = size() + divisor = process_set.size() op = Sum else: divisor = 1 elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensor.device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -120,7 +147,7 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor try: handle = getattr(mpi_lib, function)(tensor, output, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) @@ -128,7 +155,8 @@ def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor def allreduce_async(tensor, average=None, name=None, op=None, - prescale_factor=1.0, 
postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -143,13 +171,16 @@ def allreduce_async(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different - ranks. Defaults to Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
Returns: A handle to the allreduce operation that can be used with `poll()` or @@ -157,30 +188,32 @@ def allreduce_async(tensor, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) output = tensor.new(tensor.shape) - return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor) + return _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor, process_set) class HorovodAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a tensor.""" @staticmethod - def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor): + def forward(ctx, tensor, average, name, op, prescale_factor, postscale_factor, process_set): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor) + ctx.process_set = process_set + handle = allreduce_async(tensor, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): return allreduce(grad_output, average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor), None, None, None, None, None + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set), None, None, None, None, None, None def allreduce(tensor, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): """ A function that performs averaging or summation of the input tensor over all the Horovod processes. The input tensor is not modified. @@ -199,16 +232,19 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. 
Will be removed in v1.0. name: A name of the reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. Defaults + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all @@ -216,12 +252,14 @@ def allreduce(tensor, average=None, name=None, compression=Compression.none, op= """ tensor_compressed, ctx = compression.compress(tensor) summed_tensor_compressed = HorovodAllreduce.apply(tensor_compressed, average, name, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, + process_set) return compression.decompress(summed_tensor_compressed, ctx) def allreduce_async_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous in-place averaging or summation of the input tensor over all the Horovod processes. @@ -236,24 +274,28 @@ def allreduce_async_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. 
+ op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the allreduce operation that can be used with `poll()` or `synchronize()`. """ op = handle_average_backwards_compatibility(op, average) - return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor) + return _allreduce_async(tensor, tensor, name, op, prescale_factor, postscale_factor, process_set) def allreduce_(tensor, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs in-place averaging or summation of the input tensor over all the Horovod processes. @@ -268,19 +310,22 @@ def allreduce_(tensor, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A name of the reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
Returns: A tensor of the same shape and type as `tensor`, averaged or summed across all processes. """ - handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor) + handle = allreduce_async_(tensor, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) @@ -288,16 +333,18 @@ def _grouped_allreduce_function_factory(tensor): return 'horovod_torch_grouped_allreduce_async_' + tensor.type().replace('.', '_') -def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor): +def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set: ProcessSet): # Set the divisor for reduced gradients to average when necessary if op == Average: if rocm_built(): # For ROCm, perform averaging at framework level - divisor = size() + divisor = process_set.size() op = Sum else: divisor = 1 elif op == Adasum: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") if tensors[0].device.type != 'cpu' and gpu_available('torch'): if nccl_built(): if not is_homogeneous(): @@ -325,7 +372,7 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc try: handle = getattr(mpi_lib, function)(tensors, outputs, divisor, name.encode() if name is not None else _NULL, op, - prescale_factor, postscale_factor) + prescale_factor, postscale_factor, process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tuple(tensors), tuple(outputs)) @@ -333,7 +380,8 @@ def _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postsc def grouped_allreduce_async(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous averaging or summation of the input tensor list over all the 
Horovod processes. The input tensors are not modified. @@ -350,13 +398,16 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different - ranks. Defaults to Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or @@ -364,31 +415,34 @@ def grouped_allreduce_async(tensors, average=None, name=None, op=None, """ op = handle_average_backwards_compatibility(op, average) outputs = [t.new(t.shape) for t in tensors] - return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor) + return _grouped_allreduce_async(tensors, outputs, name, op, prescale_factor, postscale_factor, process_set) class HorovodGroupedAllreduce(torch.autograd.Function): """An autograd function that performs allreduce on a list of tensors.""" @staticmethod - def forward(ctx, average, name, op, prescale_factor, postscale_factor, *tensors): + def forward(ctx, average, name, op, prescale_factor, postscale_factor, process_set: ProcessSet, *tensors): ctx.average = average ctx.op = op ctx.prescale_factor = prescale_factor ctx.postscale_factor = postscale_factor - handle = grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor) + ctx.process_set = process_set + handle = 
grouped_allreduce_async(list(tensors), average, name, op, prescale_factor, postscale_factor, + process_set) return synchronize(handle) @staticmethod def backward(ctx, *grad_output): grad_reduced = grouped_allreduce(list(grad_output), average=ctx.average, op=ctx.op, prescale_factor=ctx.prescale_factor, - postscale_factor=ctx.postscale_factor) - return (None, None, None, None, None, *grad_reduced) + postscale_factor=ctx.postscale_factor, + process_set=ctx.process_set) + return (None, None, None, None, None, None, *grad_reduced) def grouped_allreduce(tensors, average=None, name=None, compression=Compression.none, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, process_set=global_process_set): """ A function that performs averaging or summation of the input tensor list over all the Horovod processes. The input tensors are not modified. @@ -409,16 +463,19 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. compression: Compression algorithm used during allreduce to reduce the amount of data sent during the each parameter update step. Defaults to not using compression. - op: The reduction operation to combine tensors across different ranks. Defaults + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. 
Returns: A list containing tensors of the same shape and type as in `tensors`, @@ -427,12 +484,13 @@ def grouped_allreduce(tensors, average=None, name=None, compression=Compression. tensors_compressed, ctxs = zip(*[compression.compress(t) for t in tensors]) summed_tensors_compressed = HorovodGroupedAllreduce.apply(average, name, op, prescale_factor, postscale_factor, - *tensors_compressed) + process_set, *tensors_compressed) return [compression.decompress(t, ctx) for t, ctx in zip(summed_tensors_compressed, ctxs)] def grouped_allreduce_async_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs asynchronous in-place averaging or summation of the input tensors over all the Horovod processes. @@ -449,24 +507,28 @@ def grouped_allreduce_async_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the group allreduce operation that can be used with `poll()` or `synchronize()`. 
""" op = handle_average_backwards_compatibility(op, average) - return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor) + return _grouped_allreduce_async(tensors, tensors, name, op, prescale_factor, postscale_factor, process_set) def grouped_allreduce_(tensors, average=None, name=None, op=None, - prescale_factor=1.0, postscale_factor=1.0): + prescale_factor=1.0, postscale_factor=1.0, + process_set=global_process_set): """ A function that performs in-place averaging or summation of the input tensors over all the Horovod processes. @@ -483,38 +545,66 @@ def grouped_allreduce_(tensors, average=None, name=None, op=None, average: .. warning:: .. deprecated:: 0.19.0 - Use `op` instead. Will be removed in v0.21.0. + Use `op` instead. Will be removed in v1.0. name: A base name to use for the group reduction operation. - op: The reduction operation to combine tensors across different ranks. Defaults to - Average if None is given. + op: The reduction operation to combine tensors across different ranks. + Supported op values are Sum, Average, Min, Max, and Product. Defaults + to Average if None is given. prescale_factor: Multiplicative factor to scale tensor before allreduce. postscale_factor: Multiplicative factor to scale tensor after allreduce. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A list containing tensors of the same shape and type as in `tensors`, averaged or summed across all processes. 
""" - handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor) + handle = grouped_allreduce_async_(tensors, average, name, op, prescale_factor, postscale_factor, process_set) return synchronize(handle) +def sparse_allreduce_async(tensor, name, op, process_set=global_process_set): + # Allgather aggregates along the first dimension, so we need to transpose the + # indices to enforce correct concatenation behavior, then transpose back prior to + # constructing the new aggregated sparse gradient + t = tensor + indices_handle = allgather_async(t._indices().transpose(0, 1).contiguous(), name=f'{name}.indices', + process_set=process_set) + values_handle = allgather_async(t._values(), name=f'{name}.values', process_set=process_set) + + def handle(): + # We need to sync values handle firstly for torch nightly >= 10.0 + # Issue: https://github.com/horovod/horovod/issues/2961 + values = synchronize(values_handle) + indices = synchronize(indices_handle) + + values = (values / process_set.size()) if op == Average else values + + if indices.dim() == 0 or values.dim() == 0: + return t.new().resize_as_(t) + return t.new(indices.transpose(0, 1), values, t.size()) + + return handle + + def _allgather_function_factory(tensor): return 'horovod_torch_allgather_async_' + tensor.type().replace('.', '_') -def _allgather_async(tensor, output, name): +def _allgather_async(tensor, output, name, process_set: ProcessSet): function = _check_function(_allgather_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, name.encode() if name is not None else _NULL) + tensor, output, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def allgather_async(tensor, name=None): +def allgather_async(tensor, name=None, process_set=global_process_set): """ A function that asynchronously concatenates the 
input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -526,44 +616,47 @@ def allgather_async(tensor, name=None): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the allgather operation that can be used with `poll()` or `synchronize()`. """ output = tensor.new() - return _allgather_async(tensor, output, name) + return _allgather_async(tensor, output, name, process_set) class HorovodAllgather(torch.autograd.Function): """An autograd function that performs allgather on a tensor.""" @staticmethod - def forward(ctx, tensor, name): + def forward(ctx, tensor, name, process_set: ProcessSet): ctx.dim = tensor.shape[0] - handle = allgather_async(tensor, name) + ctx.process_set = process_set + handle = allgather_async(tensor, name, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False) + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#average CHANGE dim_t = torch.IntTensor([ctx.dim]) - dim = allgather(dim_t).view(size()) + dim = allgather(dim_t, process_set=ctx.process_set).view(ctx.process_set.size()) - r = rank() + r = ctx.process_set.rank() offset = torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 - return grad_reduced.narrow(0, offset, ctx.dim), None + return grad_reduced.narrow(0, offset, ctx.dim), None, None -def allgather(tensor, name=None): +def allgather(tensor, name=None, process_set=global_process_set): """ A function that concatenates the input tensor with the same input tensor on all other Horovod processes. The input tensor is not modified. 
- The concatenation is done on the first dimension, so the input tensors on the - different processes must have the same rank and shape, except for the first - dimension, which is allowed to be different. + The concatenation is done on the first dimension, so the corresponding + input tensors on the different processes must have the same rank and + shape, except for the first dimension, which is allowed to be different. This acts as a thin wrapper around an autograd function. If your input tensor requires gradients, then callings this function will allow gradients @@ -572,6 +665,8 @@ def allgather(tensor, name=None): Arguments: tensor: A tensor to allgather. name: A name of the allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same type as `tensor`, concatenated on dimension zero @@ -579,25 +674,118 @@ def allgather(tensor, name=None): the first dimension, which may be greater and is the sum of all first dimensions of the tensors in different Horovod processes. 
""" - return HorovodAllgather.apply(tensor, name) + return HorovodAllgather.apply(tensor, name, process_set) + + +def _grouped_allgather_function_factory(tensor): + return 'horovod_torch_grouped_allgather_async_' + tensor.type().replace('.', '_') + + +def _grouped_allgather_async(tensors, outputs, name, process_set: ProcessSet): + function = _check_function(_grouped_allgather_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)( + tensors, outputs, name.encode() if name is not None else _NULL, + process_set.process_set_id) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_allgather_async(tensors, name=None, process_set=global_process_set): + """ + A function that asynchronously concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. The input + tensors are not modified. + + The concatenation is done on the first dimension, so the input tensors on the + different processes must have the same rank and shape, except for the first + dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A handle to the group allgather operation that can be used with `poll()` or + `synchronize()`. 
+ """ + outputs = [t.new() for t in tensors] + return _grouped_allgather_async(tensors, outputs, name, process_set) + + +class HorovodGroupedAllgather(torch.autograd.Function): + """An autograd function that performs allgather on a list of tensors.""" + + @staticmethod + def forward(ctx, name, process_set: ProcessSet, *tensors): + ctx.dims = [t.shape[0] for t in tensors] + ctx.process_set = process_set + handle = grouped_allgather_async(list(tensors), name, process_set) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + grad_reduced = grouped_allreduce(list(grad_output), average=True, process_set=ctx.process_set) + + dim_ts = [torch.IntTensor([dim]) for dim in ctx.dims] + dims = [dt.view(ctx.process_set.size()) + for dt in grouped_allgather(dim_ts, process_set=ctx.process_set)] + + r = ctx.process_set.rank() + offsets = [torch.sum(dim.narrow(0, 0, r)).item() if r != 0 else 0 + for dim in dims] + result = [gr.narrow(0, offset, dim) + for gr, offset, dim in zip(grad_reduced, offsets, ctx.dims)] + return (None, None, *result) + + +def grouped_allgather(tensors, name=None, process_set=global_process_set): + """ + A function that concatenates each input tensor with the corresponding + input tensor on all other Horovod processes for a list of input tensors. + The input tensors are not modified. + + The concatenation is done on the first dimension, so the corresponding input + tensors on the different processes must have the same rank and shape, except + for the first dimension, which is allowed to be different. + + Arguments: + tensors: A list of tensors to allgather. + name: A base name to use for the group allgather operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + + Returns: + A list containing tensors of the same type as in `tensors`. Each tensor + is concatenated on dimension zero across all processes. 
Its shape is + identical to the corresponding input shape, except for the first + dimension, which may be greater and is the sum of all first dimensions + of the corresponding tensor in different Horovod processes. + """ + return HorovodGroupedAllgather.apply(name, process_set, *tensors) def _broadcast_function_factory(tensor): return 'horovod_torch_broadcast_async_' + tensor.type().replace('.', '_') -def _broadcast_async(tensor, output, root_rank, name): +def _broadcast_async(tensor, output, root_rank, name, process_set: ProcessSet): function = _check_function(_broadcast_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, output, root_rank, name.encode() if name is not None else _NULL) + tensor, output, root_rank, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) _handle_map[handle] = (tensor, output) return handle -def broadcast_async(tensor, root_rank, name=None): +def broadcast_async(tensor, root_rank, name=None, process_set=global_process_set): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -611,33 +799,36 @@ def broadcast_async(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`.
""" output = tensor.new(tensor.shape) - return _broadcast_async(tensor, output, root_rank, name) + return _broadcast_async(tensor, output, root_rank, name, process_set) class HorovodBroadcast(torch.autograd.Function): """An autograd function that broadcasts a tensor.""" @staticmethod - def forward(ctx, tensor, root_rank, name): + def forward(ctx, tensor, root_rank, name, process_set: ProcessSet): ctx.root_rank = root_rank - handle = broadcast_async(tensor, root_rank, name) + ctx.process_set = process_set + handle = broadcast_async(tensor, root_rank, name, process_set) return synchronize(handle) @staticmethod def backward(ctx, grad_output): - grad_reduced = allreduce(grad_output, average=False) + grad_reduced = allreduce(grad_output, average=False, process_set=ctx.process_set)#True CHANGED if rank() != ctx.root_rank: grad_reduced *= 0 - return grad_reduced, None, None + return grad_reduced, None, None, None -def broadcast(tensor, root_rank, name=None): +def broadcast(tensor, root_rank, name=None, process_set=global_process_set): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The input tensor is not modified. @@ -655,15 +846,17 @@ def broadcast(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - return HorovodBroadcast.apply(tensor, root_rank, name) + return HorovodBroadcast.apply(tensor, root_rank, name, process_set) -def broadcast_async_(tensor, root_rank, name=None): +def broadcast_async_(tensor, root_rank, name=None, process_set=global_process_set): """ A function that asynchronously broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -677,15 +870,17 @@ def broadcast_async_(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the broadcast operation that can be used with `poll()` or `synchronize()`. """ - return _broadcast_async(tensor, tensor, root_rank, name) + return _broadcast_async(tensor, tensor, root_rank, name, process_set) -def broadcast_(tensor, root_rank, name=None): +def broadcast_(tensor, root_rank, name=None, process_set=global_process_set): """ A function that broadcasts the input tensor on root rank to the same input tensor on all other Horovod processes. The operation is performed in-place. @@ -699,35 +894,37 @@ def broadcast_(tensor, root_rank, name=None): tensor: A tensor to broadcast. root_rank: The rank to broadcast the value from. name: A name of the broadcast operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A tensor of the same shape and type as `tensor`, with the value broadcasted from root rank. 
""" - handle = broadcast_async_(tensor, root_rank, name) + handle = broadcast_async_(tensor, root_rank, name, process_set) return synchronize(handle) def _alltoall_function_factory(tensor): return 'horovod_torch_alltoall_async_' + tensor.type().replace('.', '_') -def _alltoall_async(tensor, splits, output, name): +def _alltoall_async(tensor, splits, output, output_received_splits, name, process_set: ProcessSet): if splits is None: # If splits not provided, create empty tensor as placeholder splits = torch.tensor([], dtype=torch.int32, device='cpu') elif not isinstance(splits, torch.Tensor): splits = torch.tensor(splits, dtype=torch.int32, device='cpu') - function = _check_function(_alltoall_function_factory, tensor) try: handle = getattr(mpi_lib, function)( - tensor, splits, output, name.encode() if name is not None else _NULL) + tensor, splits, output, output_received_splits, name.encode() if name is not None else _NULL, + process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) - _handle_map[handle] = (tensor, splits, output) + _handle_map[handle] = (tensor, splits, (output, output_received_splits)) return handle -def alltoall_async(tensor, splits=None, name=None): +def alltoall_async(tensor, splits=None, name=None, process_set=global_process_set): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. The input @@ -745,37 +942,45 @@ def alltoall_async(tensor, splits=None, name=None): not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: A handle to the alltoall operation that can be used with `poll()` or `synchronize()`. 
""" output = tensor.new() - return _alltoall_async(tensor, splits, output, name) + if isinstance(splits, torch.Tensor): + output_received_splits = splits.new() + else: + output_received_splits = torch.empty(size(), dtype=torch.int32, device='cpu') + return _alltoall_async(tensor, splits, output, output_received_splits, name, process_set) class HorovodAlltoall(torch.autograd.Function): """An autograd function that performs alltoall on a tensor.""" @staticmethod - def forward(ctx, tensor, splits, name): - ctx.tensor = tensor - ctx.splits = splits - handle = alltoall_async(tensor, splits, name) - return synchronize(handle) + def forward(ctx, tensor, splits, name, process_set: ProcessSet): + handle = alltoall_async(tensor, splits, name, process_set) + output, received_splits = synchronize(handle) + + ctx.process_set = process_set + ctx.recvsplits = received_splits + if splits is None: + return output + else: + ctx.mark_non_differentiable(received_splits) + return output, received_splits @staticmethod - def backward(ctx, grad_output): - recvsplits = None - if ctx.splits is not None: - recvsplits = alltoall(ctx.splits, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) - else: - splits_equal = torch.ones(size(), dtype=torch.int32, device='cpu') * (ctx.tensor.size()[0] // size()) - recvsplits = alltoall(splits_equal, splits=torch.ones(size(), dtype=torch.int32, device='cpu')) - return alltoall(grad_output, splits=recvsplits), None, None + def backward(ctx, grad_output, *dead_gradients): + grad_wrt_tensor, _ = alltoall(grad_output, splits=ctx.recvsplits, + process_set=ctx.process_set) + return grad_wrt_tensor, None, None, None -def alltoall(tensor, splits=None, name=None): +def alltoall(tensor, splits=None, name=None, process_set=global_process_set): """ A function that scatters slices of the input tensor to all other Horovod processes and returns a tensor of gathered slices from all other Horovod processes. 
The input @@ -797,22 +1002,263 @@ def alltoall(tensor, splits=None, name=None): not provided, the first dimension is split equally by the number of Horovod processes. name: A name of the alltoall operation. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. Returns: - A tensor containing the gathered tensor data from all workers. + 1) A tensor containing the gathered tensor data from all workers. + 2) If `splits` has been provided: A tensor of integers in rank order + describing how many elements in the output tensor have been received + from each worker. + """ + return HorovodAlltoall.apply(tensor, splits, name, process_set) + + +def _reducescatter_function_factory(tensor): + return 'horovod_torch_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _reducescatter_async(tensor, output, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_reducescatter_function_factory, tensor) + try: + handle = getattr(mpi_lib, function)(tensor, output, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tensor, output) + return handle + + +def reducescatter_async(tensor, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): """ - return HorovodAlltoall.apply(tensor, splits, name) + A function that performs asynchronous reduction of the input tensor over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensor is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. 
The reduction will not start until all processes + are ready to send and receive the tensor. + + The input tensors on the different processes must have the same rank and shape. The + output tensor will be the same rank on all processes, but the first dimension may + be different. + + Arguments: + tensor: A tensor to average and sum. + name: A name of the reduction operation. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A handle to the reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + output = tensor.new() + return _reducescatter_async(tensor, output, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a tensor.""" + + @staticmethod + def forward(ctx, tensor, name, op, process_set, prescale_factor, postscale_factor): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = reducescatter_async(tensor, name, op, process_set, prescale_factor, postscale_factor) + return synchronize(handle) + + @staticmethod + def backward(ctx, grad_output): + if ctx.op == Sum: + grad_output *= ctx.process_set.size() + if ctx.prescale_factor != 1.0: + grad_output *= ctx.prescale_factor + if ctx.postscale_factor != 1.0: + grad_output *= ctx.postscale_factor + + return allgather(grad_output, process_set=ctx.process_set), None, None, None, None, None + + +def reducescatter(tensor, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): 
+ """ + A function that performs reduction of the input tensor over all the Horovod + processes, then scatters the results across all Horovod processes. The input tensor + is not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensor: A tensor to average/sum and scatter. + name: A name of the reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensor before reducescatter. + postscale_factor: Multiplicative factor to scale tensor after reducescatter. + + Returns: + A tensor of the same rank and type as `tensor` across all processes. The shape + is identical to the input shape, except for the first dimension, which will be + divided across the different Horovod processes. 
+ """ + tensor_compressed, ctx = compression.compress(tensor) + reduced_tensor_compressed = HorovodReducescatter.apply(tensor_compressed, name, op, process_set, + prescale_factor, postscale_factor) + return compression.decompress(reduced_tensor_compressed, ctx) + + +def _grouped_reducescatter_function_factory(tensor): + return 'horovod_torch_grouped_reducescatter_async_' + tensor.type().replace('.', '_') + + +def _grouped_reducescatter_async(tensors, outputs, name, op, process_set: ProcessSet, + prescale_factor, postscale_factor): + function = _check_function(_grouped_reducescatter_function_factory, tensors[0]) + try: + handle = getattr(mpi_lib, function)(tensors, outputs, + name.encode() if name is not None else _NULL, + op, process_set.process_set_id, + prescale_factor, postscale_factor) + except RuntimeError as e: + raise HorovodInternalError(e) + _handle_map[handle] = (tuple(tensors), tuple(outputs)) + return handle + + +def grouped_reducescatter_async(tensors, name=None, op=Average, process_set=global_process_set, + prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs asynchronous reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. + + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. For each of the input tensors, the type and shape must + be the same on all Horovod processes for a given name. The reduction will not start + until all processes are ready to send and receive the tensors. + + The input tensor at some place in the list tensor must have the same rank and shape + on the different processes. The corresponding output tensor will be the same rank on + all processes, but the first dimension may be different. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. 
+ op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + Returns: + A handle to the group reducescatter operation that can be used with `poll()` or + `synchronize()`. + """ + outputs = [t.new() for t in tensors] + return _grouped_reducescatter_async(tensors, outputs, name, op, process_set, + prescale_factor, postscale_factor) + + +class HorovodGroupedReducescatter(torch.autograd.Function): + """An autograd function that performs reducescatter on a list of tensors.""" + + @staticmethod + def forward(ctx, name, op, process_set, prescale_factor, postscale_factor, *tensors): + ctx.op = op + ctx.process_set = process_set + ctx.prescale_factor = prescale_factor + ctx.postscale_factor = postscale_factor + handle = grouped_reducescatter_async(list(tensors), name, op, process_set, + prescale_factor, postscale_factor, ) + return synchronize(handle) + + @staticmethod + def backward(ctx, *grad_output): + if ctx.op == Sum: + grad_output = [g * ctx.process_set.size() for g in grad_output] + if ctx.prescale_factor != 1.0: + grad_output = [ctx.prescale_factor * g for g in grad_output] + if ctx.postscale_factor != 1.0: + grad_output = [ctx.postscale_factor * g for g in grad_output] + + return (None, None, None, None, None, + *grouped_allgather(grad_output, process_set=ctx.process_set)) + + +def grouped_reducescatter(tensors, name=None, compression=Compression.none, op=Average, + process_set=global_process_set, prescale_factor=1.0, postscale_factor=1.0): + """ + A function that performs reduction of a list of input tensors over all the + Horovod processes, then scatters the results across all Horovod processes. The + input tensors are not modified. 
+ + The reduction operation is keyed by the name. If name is not provided, an incremented + auto-generated name is used. The tensor type and shape must be the same on all + Horovod processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Arguments: + tensors: A list of tensors to average and sum. + name: A base name to use for the group reduction operation. + compression: Compression algorithm used during reducescatter to reduce the amount + of data sent during the each parameter update step. Defaults to + not using compression. + op: The reduction operation to combine tensors across different ranks. + Defaults to Average. + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + prescale_factor: Multiplicative factor to scale tensors before reducescatter. + postscale_factor: Multiplicative factor to scale tensors after reducescatter. + + + Returns: + A list containing tensors of the same rank and type as in `tensors`. For each + tensor the shape is identical to the input shape, except for the first dimension, + which will be divided across the different Horovod processes. + """ + tensors_compressed = [] + ctxs = [] + for tensor in tensors: + tensor_compressed, ctx = compression.compress(tensor) + tensors_compressed.append(tensor_compressed) + ctxs.append(ctx) + reduced_tensors_compressed = HorovodGroupedReducescatter.apply(name, op, process_set, + prescale_factor, postscale_factor, + *tensors_compressed) + return [compression.decompress(reduced_tensor_compressed, ctx) + for reduced_tensor_compressed, ctx in zip(reduced_tensors_compressed, ctxs)] def poll(handle): """ - Polls an allreduce, allgather or broadcast handle to determine whether underlying - asynchronous operation has completed. After `poll()` returns `True`, `synchronize()` - will return without blocking. 
+ Polls an allreduce, allgather, alltoall, broadcast, or reducescatter handle to determine whether + underlying asynchronous operation has completed. After `poll()` returns `True`, + `synchronize()` will return without blocking. Arguments: - handle: A handle returned by an allreduce, allgather or broadcast asynchronous - operation. + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. Returns: A flag indicating whether the operation has completed. @@ -822,15 +1268,15 @@ def poll(handle): def synchronize(handle): """ - Synchronizes an asynchronous allreduce, allgather or broadcast operation until - it's completed. Returns the result of the operation. + Synchronizes an asynchronous allreduce, allgather, alltoall, broadcast, or reducescatter operation + until it's completed. Returns the result of the operation. Arguments: - handle: A handle returned by an allreduce, allgather or broadcast asynchronous - operation. + handle: A handle returned by an allreduce, allgather, alltoall, broadcast, or reducescatter + asynchronous operation. Returns: - An output tensor of the operation. + A single output tensor of the operation or a tuple of multiple output tensors. """ if handle not in _handle_map: return @@ -840,10 +1286,11 @@ def synchronize(handle): output = _handle_map.pop(handle)[-1] return output except RuntimeError as e: + _handle_map.pop(handle, None) raise HorovodInternalError(e) -def join(device=-1): +def join(device=-1) -> int: """A function that indicates that the rank finished processing data. All ranks that did not call join() continue to process allreduce operations. @@ -855,7 +1302,32 @@ def join(device=-1): Returns: Id of the rank that joined last. 
""" + output = torch.tensor(-1, dtype=torch.int, device=torch.device("cpu")) + try: + handle = mpi_lib.horovod_torch_join(output, device) + except RuntimeError as e: + raise HorovodInternalError(e) + + _handle_map[handle] = (None, output) + + return synchronize(handle).item() + +def barrier(process_set=global_process_set): + """ + A function that acts as a simple sychronization point for ranks specified + in the given process group(default to global group). Ranks that reach + this function call will stall until all other ranks have reached. + + Arguments: + process_set: Process set object to limit this operation to a subset of + Horovod processes. Default is the global process set. + """ + try: - return mpi_lib.horovod_torch_join(device) + handle = mpi_lib.horovod_torch_barrier(process_set.process_set_id) except RuntimeError as e: raise HorovodInternalError(e) + + _handle_map[handle] = (None, None) + + synchronize(handle) diff --git a/patch_files/horovod/torch/optimizer.py b/patch_files/horovod/torch/optimizer.py index ffe7d42..7e3f3b1 100644 --- a/patch_files/horovod/torch/optimizer.py +++ b/patch_files/horovod/torch/optimizer.py @@ -25,28 +25,32 @@ from horovod.torch.compression import Compression from horovod.torch.functions import broadcast_object -from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_ +from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_, sparse_allreduce_async from horovod.torch.mpi_ops import synchronize from horovod.torch.mpi_ops import size from horovod.torch.mpi_ops import Average, Adasum, Sum from horovod.torch.mpi_ops import rocm_built +from horovod.torch.mpi_ops import ProcessSet, global_process_set class _DistributedOptimizer(torch.optim.Optimizer): - def __init__(self, params, named_parameters, grace, compression, + def __init__(self, params, named_parameters,grace, compression, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - num_groups=0): + groups=None, + 
sparse_as_dense=False, + process_set=global_process_set): super(self.__class__, self).__init__(params) - self._compression = compression self._grace = grace + self._process_set=process_set + self._compression = compression if named_parameters is not None: named_parameters = list(named_parameters) else: - named_parameters = [('allreduce.noname.%s' % i, v) - for param_group in self.param_groups - for i, v in enumerate(param_group['params'])] + named_parameters = [(f'allreduce.noname.{i}.{j}', v) + for i, param_group in enumerate(self.param_groups) + for j, v in enumerate(param_group['params'])] # make sure that named_parameters are tuples if any([not isinstance(p, tuple) for p in named_parameters]): raise ValueError('named_parameters should be a sequence of ' @@ -74,15 +78,33 @@ def __init__(self, params, named_parameters, grace, compression, for _, v in sorted(named_parameters)} self.op = op self.gradient_predivide_factor = gradient_predivide_factor + self.sparse_as_dense = sparse_as_dense + self.process_set = process_set + self._handles = {} self._grad_accs = [] self._requires_update = set() self._synchronized = False self._should_synchronize = True - self._num_groups = num_groups + + if groups is not None: + if not (isinstance(groups, list) or groups > 0): + raise ValueError('groups should be a non-negative integer or ' + 'a list of list of torch.Tensor.') + if isinstance(groups, list): + grouped_parameter_ids = set() + for l in groups: + for p in l: + if not isinstance(p, torch.Tensor): + raise ValueError('groups must consist of torch.Tensor.') + if id(p) in grouped_parameter_ids: + raise ValueError('A parameter can only appear once in groups.') + grouped_parameter_ids.add(id(p)) + self._groups = groups self._p_to_group = {} self._group_counts = {} - if size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1': + + if self.process_set.included() and (size() > 1 or os.environ.get('HOROVOD_ELASTIC') == '1'): self._register_hooks() def load_state_dict(self, *args, 
**kwargs): @@ -109,8 +131,7 @@ def set_backward_passes_per_step(self, passes): self._allreduce_delay[p] = self.backward_passes_per_step def _register_hooks(self): - - if self._num_groups > 0: + if self._groups is not None: p_list = [] # Get list of parameters with grads for param_group in self.param_groups: @@ -121,16 +142,29 @@ def _register_hooks(self): # To ensure parameter order and group formation is consistent, broadcast p_list order # from rank 0 and use for every worker p_list_names = [self._parameter_names.get(p) for p in p_list] - p_list_names = broadcast_object(p_list_names, root_rank=0) - p_list = sorted(p_list, key=lambda p : p_list_names.index(self._parameter_names.get(p))) + p_list_names = broadcast_object(p_list_names, root_rank=0, process_set=self.process_set) + p_list = sorted(p_list, key=lambda p: p_list_names.index(self._parameter_names.get(p))) # Form groups - p_groups = split_list(p_list, self._num_groups) + if isinstance(self._groups, list): + p_groups = [] + grouped_id = set() + p_list_ids = [id(p) for p in p_list] + for group in self._groups: + p_groups.append([p for p in group if id(p) in p_list_ids]) + for p in p_groups[-1]: + grouped_id.add(id(p)) + for p in p_list: + if id(p) not in grouped_id: + p_groups.append([p]) + else: + p_groups = split_list(p_list, self._groups) + p_groups = [tuple(p) for p in p_groups] for group in p_groups: - for p in group: - self._p_to_group[p] = group - self._group_counts[group] = 0 + for p in group: + self._p_to_group[p] = group + self._group_counts[group] = 0 for param_group in self.param_groups: for p in param_group['params']: @@ -143,16 +177,33 @@ def _register_hooks(self): self._grad_accs.append(grad_acc) def _allreduce_grad_async(self, p): + if p.grad is None: + # Gradient was not computed, but we still need to submit a tensor to allreduce + # as one of the other ranks may have computed it (due to dynamic forward functions). 
+ # + # NOTE: this will not work if the gradient is sparse and we perform an allgather. + # Unfrotunately, there doesn't appear to be a good way to detect that the parameter will + # produce sparse gradients before computing the gradient. + p.grad = p.data.new(p.size()).zero_() + name = self._parameter_names.get(p) tensor = p.grad - if self._grace and self._num_groups == 0 and self.op == Average: - handle, ctx = self._grace.send_step(tensor, name) + + if p.grad.is_sparse: + if self.sparse_as_dense: + tensor = tensor.to_dense() + else: + return self._sparse_allreduce_grad_async(p, name) + if self._grace and self._groups is None and self.op == Average: + handle, ctx = self._grace.send_step(tensor, name,self._process_set) + postscale_factor = self.gradient_predivide_factor else: + tensor_compressed, ctx = self._compression.compress(tensor) if self.op == Average: - # Split average operation across pre/postscale factors - # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. + # Split average operation across pre/postscale factors + # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average. 
prescale_factor = 1.0 / self.gradient_predivide_factor postscale_factor = self.gradient_predivide_factor else: @@ -160,17 +211,24 @@ def _allreduce_grad_async(self, p): postscale_factor = 1.0 handle = allreduce_async_(tensor_compressed, name=name, op=self.op, - prescale_factor=prescale_factor, - postscale_factor=postscale_factor) + prescale_factor=prescale_factor, + postscale_factor=postscale_factor, + process_set=self.process_set) return handle, ctx def _grouped_allreduce_grad_async(self, ps): name = self._parameter_names.get(ps[0]) tensors_compressed, ctxs = zip(*[self._compression.compress(p.grad) for p in ps]) - handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op) + handle = grouped_allreduce_async_(tensors_compressed, name=name, op=self.op, + process_set=self.process_set) return handle, ctxs + def _sparse_allreduce_grad_async(self, p, name): + handle = sparse_allreduce_async(p.grad, name=name, op=self.op, + process_set=self.process_set) + return handle, None + def _make_hook(self, p): def hook(*ignore): if p in self._handles and self._handles[p][0] is not None: @@ -185,7 +243,7 @@ def hook(*ignore): handle, ctx = None, None self._allreduce_delay[p] -= 1 if self._allreduce_delay[p] == 0: - if self._num_groups > 0: + if self._groups is not None: group = self._p_to_group[p] self._group_counts[group] += 1 if self._group_counts[group] == len(group): @@ -202,6 +260,10 @@ def hook(*ignore): return hook def synchronize(self): + if not self.process_set.included(): + self._synchronized = True + return + completed = set() for x in self._handles.keys(): completed.update(x) if isinstance(x, tuple) else completed.add(x) @@ -214,23 +276,36 @@ def synchronize(self): if handle is None: handle, ctx = self._allreduce_grad_async(p) self._handles[p] = (handle, ctx) - for p, (handle, ctx) in self._handles.items(): + if isinstance(p, tuple): # This was a grouped result, need to unpack outputs = synchronize(handle) for gp, output, gctx in zip(p, outputs, ctx): 
self._allreduce_delay[gp] = self.backward_passes_per_step gp.grad.set_(self._compression.decompress(output, gctx)) + if self._groups is not None and self._group_counts[p] != 0: + self._group_counts[p] = 0 else: - if self._grace and self._num_groups == 0 and self.op == Average: - # in GRACE, p is not tuple, but handle is. - output = self._grace.receive_step(handle, ctx) - self._allreduce_delay[p] = self.backward_passes_per_step - p.grad.set_(output) + # When handle is a callable function, it returns the aggregated tensor result + if self._grace and self._groups is None and self.op == Average: + output = self._grace.receive_step(handle, ctx,self._process_set) + p.grad.set_(output) else: - output = synchronize(handle) + output = synchronize(handle) if not callable(handle) else handle() self._allreduce_delay[p] = self.backward_passes_per_step + if self._groups is not None: + group = self._p_to_group[p] + if self._group_counts[group] != 0: + self._group_counts[group] = 0 + if p.grad.is_sparse: + aggregated = self._compression.decompress(output, ctx) + if not aggregated.is_sparse: + # When sparse_as_dense=True we need to convert the grad back to sparse before update + aggregated = aggregated.to_sparse() + + # Sparse grads do not support set_ for some reason, so we do this as an equivalent + p.grad.zero_().add_(aggregated) p.grad.set_(self._compression.decompress(output, ctx)) self._handles.clear() @@ -448,13 +523,14 @@ def zero_grad(self): return super(self.__class__, self).zero_grad() -def DistributedOptimizer(optimizer, - grace=None, named_parameters=None, +def DistributedOptimizer(optimizer, named_parameters=None,grace=None, compression=Compression.none, backward_passes_per_step=1, op=Average, gradient_predivide_factor=1.0, - num_groups=0): + num_groups=0, groups=None, + sparse_as_dense=False, + process_set=global_process_set): """ An optimizer that wraps another torch.optim.Optimizer, using an allreduce to combine gradient values before applying gradients to model 
weights. @@ -499,6 +575,18 @@ def DistributedOptimizer(optimizer, gradient_predivide_factor / size after the sum. num_groups: Number of groups to assign gradient allreduce ops to for explicit grouping. Defaults to no explicit groups. + groups: The parameter to group the gradient allreduce ops. Accept values is a + non-negative integer or a list of list of torch.Tensor. + If groups is a non-negative integer, it is the number of groups to assign + gradient allreduce ops to for explicit grouping. + If groups is a list of list of torch.Tensor. Tensors in the same + inner list will be assigned to the same group, while parameter that does + not appear in any list will form a group itself. + Defaults as None, which is no explicit groups. + sparse_as_dense: If set True, convert all sparse gradients to dense and perform allreduce, then + convert back to sparse before applying the update. + process_set: Gradients will only be reduced over Horovod processes belonging + to this process set. Defaults to the global process set. """ # We dynamically create a new class that inherits from the optimizer that was passed in. # The goal is to override the `step()` method with an allreduce implementation. 
@@ -508,12 +596,23 @@ def DistributedOptimizer(optimizer, if op != Average: raise ValueError('gradient_predivide_factor not supported with op != Average') + if num_groups != 0: + warnings.warn('Parameter `num_groups` has been replaced by `groups` ' + 'and will be removed in v0.23.0.', DeprecationWarning) + if groups is None: + groups = num_groups + + if backward_passes_per_step <= 0: + raise ValueError("backward_passes_per_step must be > 0") + if op != Adasum or size() == 1: cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedOptimizer.__dict__)) - return cls(optimizer.param_groups, named_parameters, grace, compression, backward_passes_per_step, op, - gradient_predivide_factor, num_groups) + return cls(optimizer.param_groups, named_parameters,grace, compression, backward_passes_per_step, op, + gradient_predivide_factor, groups, sparse_as_dense, process_set) else: + if process_set != global_process_set: + raise NotImplementedError("Adasum does not support non-global process sets yet.") cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_DistributedAdasumOptimizer.__dict__)) return cls(optimizer.param_groups, named_parameters, compression, backward_passes_per_step) From 31280b94101088f394836a29dea2296de1d4d4c3 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:04:42 -0400 Subject: [PATCH 13/44] Delete .DS_Store --- .DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 5043af2638c57c7830bb83a3d0f38dce8dfe20ca..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMyN=U96upxT>_ibL1q-{KB?_7#k`=oM(U=f^03#>>1v?Ijh2ybehag3f+(XZ| z(4&>;sOW(TzJY=}j}Si>D#Y&2NHfRroI9R5C$W7e06=(V_I#I_fVW+K3IM@V`4 z*b7{?=&^a=Mk3ad9kf9kgVx@1dHt~4Huev@tG2Q1c5dyrjhnX*R;vcxxq9QygTYIG 
z8n8EVEkxZ9;6iR~Z_5_uMnm7hiE?gX)&96^4cS!gT##?`GR~Cs`$`#lusVva>>b@L z=ed8y3n!^-)=V6#IImaMEbv({#y-AS3S*d|TSIhh0514^x8-!Wm0MWWtz2iuE#sYC zrcBzc4`0HMm)CCP#+tH?TgUM-Ze0^!l!6;Mx0?8yvLQydCg|1+@HRVSr@^y|<)xjg zpk(|bUq7LL70k4YKYotBUz9FZvNvA-nVoU8Cf*$?#?ASP)73>QaH191lC$6C`Tzau z{r?laMm>40fL7pm1xUSb^?S%!zf0%oJl8g`KF7)n=Oq$F2sR=eM-=He;_p8U@z_8r ck1-V{67dX{f4&irc@NXie_?`d&YS|j0M$@mbpQYW From 5a2691fe5dfe92da316d00e226bfeabc63c4d2ac Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:04:57 -0400 Subject: [PATCH 14/44] Delete patch_files/.DS_Store --- patch_files/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 patch_files/.DS_Store diff --git a/patch_files/.DS_Store b/patch_files/.DS_Store deleted file mode 100644 index 6c2a8c29066cc5b66b19eb9f8e0b1de8b7f7ab1d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z-O8O(;SS3VK`cTCpu!5ig$&Z@2A=4emE05R|@2Jn6mpoosaT%)==U{Eap zUV6IW8Gj5s>Zr;pIhr-p{VSS;( z8Fw_&NDL4IUm3vO52TLu|Lo`fUj@-X3=jjW$pEkP++G*;fl%j=@|bSU~8HfTDp0V&G31cn9cUQ&Ru{ From 9f7c3e66b90c694162f895a51ac9f37aca59596a Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:05:08 -0400 Subject: [PATCH 15/44] Delete patch_files/horovod/.DS_Store --- patch_files/horovod/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 patch_files/horovod/.DS_Store diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store deleted file mode 100644 index 43d7721ef476f915abed16f57aeeddb43526b4df..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}(1u5S|SQ*??N*K&2w}uu`w7h*1kdTuew`09W+TR;e8aV&QnB*r^XuB;VD{Waynh3KC<^AV$oW&%K5Y zs;dvfAWz3ZECZb8IvB@Mnzh@vtg=?Et=D;-Z}M;6Q089dXVbLfPrj(uC;WVZMcofh zqe0$lZXL@c^P^-i(E(97K$owdqa>7hM^2M4)v=!G;0@mBHMeK8cY97t?Cv@9mY6y1 
z{oR&$fAC>GZ}3-dcMeZ_7x5^Om*!v?e#2T`SX{tQtSoKmnVrVxj@Or?v5ES%vl=v# z-LB<vcQ0CHuCPHmBZNf%+X)iSjE9K7?RJKgH;)p5oW2TF@?31<|!w7(@#S{Rn6p Ks2~RZC Date: Mon, 22 Jul 2024 01:05:16 -0400 Subject: [PATCH 16/44] Delete patch_files/horovod/torch/.DS_Store --- patch_files/horovod/torch/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 patch_files/horovod/torch/.DS_Store diff --git a/patch_files/horovod/torch/.DS_Store b/patch_files/horovod/torch/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:05:31 -0400 Subject: [PATCH 17/44] Delete grace_dl/.DS_Store --- grace_dl/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/.DS_Store diff --git a/grace_dl/.DS_Store b/grace_dl/.DS_Store deleted file mode 100644 index c1af8e2c1f8a48d3ff3fa6980a8d556babf7ee8f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK&5qMB5FU54Y`QC;5+LXW8SS-*cDKuF#U+$-;e-f+13#r{$~Gd6t0tv~s#5N7 zKuEj>;>ru`6Ckdfc>wOr*e)d76b?v$z$4k85x|lI}T!$Y=~d?uDKZ;ZmgOFbo(5{zeAq+qJ-l0Afh8-|x51@UY?b#Be0v{^FmONZ&00UPVQ<)5_9J&7rf$+rCUL7fen3xmaU=2;?XGt(=ubP9vo}PPbc3ir zmI;C1$B>8jgUA=tmY76-EOTAuu*-J2Q>o5o&D~~=H}-euH9l+9_jhW%d9XL1m+f=g zSFYXZJPU`BctLNE8zJt5(oSkzz)NftE-uS@VL(m+cKXe#fwF3RNWQ$SDm4nn9E$JC z{46%7Qpv&Tix=|T9Y>*v4$)5}tfP{oVGjm>TEEMC2Q|KPwehR^WtOy1?bu3lTEVMJ zc^>L19KFb+Av@k$TP5}9xHVI-&}qF&nKBD!+>GnpJISnZMWURCF!$uX>oMc vax71 Date: Mon, 22 Jul 2024 01:05:40 -0400 Subject: [PATCH 18/44] Delete grace_dl/torch/.DS_Store --- grace_dl/torch/.DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/torch/.DS_Store diff --git 
a/grace_dl/torch/.DS_Store b/grace_dl/torch/.DS_Store deleted file mode 100644 index b5434247f035dca45fcad5190bd1f743e33b494b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMO>fgc5S?v9>(Bx@P^rCH;u=I!K@s9&LbxN0-~cGtvDI2QUdwifswk50@Q1i^ z;3vS9Bm4ryjoFW;wi5^Sf@nJ%?LKGsjd%Aw+0Hlszje3az43n7^GRJ z^OdcwH?C}K8k@$J@y0&l6FYOVaoTf|SK`}q&O`U)*l}M3{$$X)`ItwU6GVO@0|L)S z%IlXw Date: Mon, 22 Jul 2024 01:05:49 -0400 Subject: [PATCH 19/44] Delete grace_dl/torch/communicator/.DS_Store --- grace_dl/torch/communicator/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/torch/communicator/.DS_Store diff --git a/grace_dl/torch/communicator/.DS_Store b/grace_dl/torch/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:05:58 -0400 Subject: [PATCH 20/44] Delete grace_dl/torch/compressor/.DS_Store --- grace_dl/torch/compressor/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/torch/compressor/.DS_Store diff --git a/grace_dl/torch/compressor/.DS_Store b/grace_dl/torch/compressor/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:06:07 -0400 Subject: [PATCH 21/44] Delete grace_dl/torch/memory/.DS_Store --- grace_dl/torch/memory/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete 
mode 100644 grace_dl/torch/memory/.DS_Store diff --git a/grace_dl/torch/memory/.DS_Store b/grace_dl/torch/memory/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:06:35 -0400 Subject: [PATCH 22/44] Update allgather.py --- grace_dl/torch/communicator/allgather.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py index ff95a6c..0b7b305 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -39,13 +39,9 @@ def async_send(self, tensors_compressed, name,process_set): def wait_receive(self, result, ctx,process_set): handles, tensor_sizes = result - # print("Result",result) tensors_ag = [] for handle, sizes in zip(handles, tensor_sizes): gathered = synchronize(handle) - # print("gathered",gathered) - # print("gathered",len(gathered)) - # print("sizes",sizes) tensors_ag.append(gathered.split(sizes)) # import time # time.sleep(0.001) From d42bb5a6552a7ed1a2cd6601fe2d10c40e6aec24 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:07:01 -0400 Subject: [PATCH 23/44] Update topk.py --- grace_dl/torch/compressor/topk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 9ccfc2d..38e36a0 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -22,7 +22,7 @@ def desparsify(tensors, numel): class TopKCompressor(Compressor): def __init__(self, compress_ratio): - super().__init__()#tensors_size_are_same=False + super().__init__() self.compress_ratio = 
compress_ratio def compress(self, tensor, name): From e60a421d3f136d299c40dd9449fc14c1f2a05717 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:07:17 -0400 Subject: [PATCH 24/44] Update residual.py --- grace_dl/torch/memory/residual.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index 6054d25..55819a6 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -9,15 +9,12 @@ def __init__(self, beta=1.0, gamma=1.0): def compensate(self, tensor, name): """Update the tensor with the residuals.""" - # print("inside residuals compensate") if name in self.residuals: tensor = self.beta * self.residuals[name] + self.gamma * tensor - # print("inside residuals",self.residuals[name] ) return tensor def update(self, tensor, name, compressor, tensor_compressed, ctx): """Update the residuals.""" tensor_decompressed = compressor.decompress(tensor_compressed, ctx) residual = tensor - tensor_decompressed - # print("residual update",residual) self.residuals[name] = residual From 0146a96623c961158e1e45e424eaf0779a80a42d Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:08:10 -0400 Subject: [PATCH 25/44] Update __init__.py --- grace_dl/torch/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/grace_dl/torch/__init__.py b/grace_dl/torch/__init__.py index 12900df..4030ee4 100644 --- a/grace_dl/torch/__init__.py +++ b/grace_dl/torch/__init__.py @@ -36,23 +36,23 @@ def aggregate(self, tensors): class Communicator(ABC): @abstractmethod - def async_send(self, tensors, name): + def async_send(self, tensors, name,process_set): raise NotImplemented("async_send was not implemented.") @abstractmethod - def wait_receive(self, handles, ctx): + def wait_receive(self, handles, ctx,process_set): raise 
NotImplemented("wait_receive was not implemented.") def __init__(self, compressor, memory): self.compressor = compressor self.memory = memory - def send_step(self, tensor, name): + def send_step(self, tensor, name,process_set): tensor = self.memory.compensate(tensor, name) tensors_compressed, ctx = self.compressor.compress(tensor, name) self.memory.update(tensor, name, self.compressor, tensors_compressed, ctx) - handles = self.async_send(tensors_compressed, name) + handles = self.async_send(tensors_compressed, name,process_set) return handles, ctx - def receive_step(self, handles, ctx): - return self.wait_receive(handles, ctx) + def receive_step(self, handles, ctx,process_set): + return self.wait_receive(handles, ctx,process_set) From 3944638a2cc2feb408bca0b3089e1f8bded21537 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:10:49 -0400 Subject: [PATCH 26/44] Update helper.py --- grace_dl/torch/helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grace_dl/torch/helper.py b/grace_dl/torch/helper.py index a59a6b9..e316c80 100644 --- a/grace_dl/torch/helper.py +++ b/grace_dl/torch/helper.py @@ -1,6 +1,6 @@ -def grace_from_params(params): - import horovod.torch as hvd - world_size = hvd.size() +from horovod.torch.mpi_ops import global_process_set +def grace_from_params(params,processset=global_process_set.size()): + world_size = processset.size() comp = params.get('compressor', 'none') mem = params.get('memory', 'none') comm = params.get('communicator', 'allreduce') From 55718c6afb6f011e7337bfcc15c1ac839e8f280d Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:13:08 -0400 Subject: [PATCH 27/44] Update allgather.py --- grace_dl/torch/communicator/allgather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py 
index 0b7b305..f355874 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -1,6 +1,6 @@ import torch -from grace.grace_dl.torch import Communicator +from grace_dl.torch import Communicator from horovod.torch import allgather, allgather_async, synchronize From c53fb02dba47f146d66fe29ed7e7cd7c103c065e Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:13:22 -0400 Subject: [PATCH 28/44] Update topk.py --- grace_dl/torch/compressor/topk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 38e36a0..381d4ec 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -1,6 +1,6 @@ import torch -from grace.grace_dl.torch import Compressor +from grace_dl.torch import Compressor def sparsify(tensor, compress_ratio): From 349b46671ead2e83758783c8179938b971ae7655 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:13:36 -0400 Subject: [PATCH 29/44] Update residual.py --- grace_dl/torch/memory/residual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grace_dl/torch/memory/residual.py b/grace_dl/torch/memory/residual.py index 55819a6..a2ae61f 100644 --- a/grace_dl/torch/memory/residual.py +++ b/grace_dl/torch/memory/residual.py @@ -1,4 +1,4 @@ -from grace.grace_dl.torch import Memory +from grace_dl.torch import Memory class ResidualMemory(Memory): From e847b2f6a7756a705ec120154448aa00176130b4 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:14:16 -0400 Subject: [PATCH 30/44] Update helper.py --- grace_dl/torch/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grace_dl/torch/helper.py b/grace_dl/torch/helper.py index e316c80..c2b170e 100644 --- 
a/grace_dl/torch/helper.py +++ b/grace_dl/torch/helper.py @@ -1,5 +1,5 @@ from horovod.torch.mpi_ops import global_process_set -def grace_from_params(params,processset=global_process_set.size()): +def grace_from_params(params,processset=global_process_set): world_size = processset.size() comp = params.get('compressor', 'none') mem = params.get('memory', 'none') From ef702e8113795060525c4d4603731fe6c8788b0f Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:15:19 -0400 Subject: [PATCH 31/44] Update allgather.py --- grace_dl/torch/communicator/allgather.py | 1 - 1 file changed, 1 deletion(-) diff --git a/grace_dl/torch/communicator/allgather.py b/grace_dl/torch/communicator/allgather.py index f355874..4bca0af 100644 --- a/grace_dl/torch/communicator/allgather.py +++ b/grace_dl/torch/communicator/allgather.py @@ -9,7 +9,6 @@ def __init__(self, compressor, memory, world_size): super().__init__(compressor, memory) self.world_size = world_size - def async_send(self, tensors_compressed, name,process_set): """ :param tensors_compressed: list of flat tensors to communicate From f74095c32a819c093de9c8217705d665016e9ff4 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:15:40 -0400 Subject: [PATCH 32/44] Delete grace_dl/dist/.DS_Store --- grace_dl/dist/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/dist/.DS_Store diff --git a/grace_dl/dist/.DS_Store b/grace_dl/dist/.DS_Store deleted file mode 100644 index 0e5427f8fe4b097ce86acb1b96e871829d3c7ce3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c^3>-s*BA`e~xxc_4tfKG*`~Z-oKnfhnqoBKv-)8JJ=-@(20ppcDcRtUq zZi@36fXxr9YhVUoN_WJYhq3u{_mQ1s=9Fl$#{tiH#1i+|kE)L+oO?+|hbOKfzr%L5 zTW%h@ZsXMREPBBfYwWnb6`Ya3rZbJ#A@lgb%1Qw#AO)m=6!@nKuxGnXFB&RJ0VyB_ 
zJ{9oqL!mp?#J*vCIv8REAWoPL<8{mu#Nr8JP3#*oL$gLDHmc=_VU5mw$-0`@H*9oR z4j+~$TTUnzr*r=j<*?dNQ3^_f xHNDps`W^klSR3UG(Ta)Dih1L$_-asB{F={eV&5?6%m4BVYgk From 9123421197652ca3564ba6aa16db507853700b60 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:15:51 -0400 Subject: [PATCH 33/44] Delete grace_dl/tensorflow/.DS_Store --- grace_dl/tensorflow/.DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/tensorflow/.DS_Store diff --git a/grace_dl/tensorflow/.DS_Store b/grace_dl/tensorflow/.DS_Store deleted file mode 100644 index 1cafe259310a11e1a521db1a2a204070823cadc2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM&2G~`5T0#APp#=J^P3R;{hfRf(K);AYJiR9f|?PfMP%~pcqgL{0|J^Gn*By=D9Dds#Odq25uw+ z?Eau*WoSsu1oB%4HarDDY+yGpxQ}@NkI^L>5;K836vq_V1CgObrWizoqg`k2h=#;W zpa=&d!hy&m6Pck9esqk}xjPU;plTHZih)%ISnhrsGj<9AZu$J3LTpcy$o8p?K40y~ z_mZ^R{XsT18@INaT2tH6-kN80YNl2?Nd{K@n*Dk~sc$d#E&HYCPDky%Cp1hgFLYxb z;5jb3ym{q?4xJ9@ByUZ?s@zK1a&wAbC2OYhC@^C(HY5Vu? 
zKYBiT?*}3MAl5|WeF1h$Yy7xwfle~^ZJaQRsb#@G?U`d5L}K9GyX$-dIvh&IG&pyE z-Z}jKc)h=KDgWp!KbJ0>QtiCY%{cUFc#1xZoNcLc1Tnaf;C77^u3TJR-Aad|vY)r> zSjwx~{IpcHa?F7KQS@)V1vLpj!79frDY=DLs*4r%gNy5OjTRQQP>!9q1w&D>y>DfE zUa4PSmP!@I$SYLmQ&0w;;bDsNC@I41swf7o76Y4Nx(6)(e_H?j|7tBt%~mm>7`PMz zq%kyy0}NTbLzkmjuB~Hzhm{rPmkHz{*f1B?&(m?l Date: Mon, 22 Jul 2024 01:16:01 -0400 Subject: [PATCH 34/44] Delete grace_dl/tensorflow/memory/.DS_Store --- grace_dl/tensorflow/memory/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/tensorflow/memory/.DS_Store diff --git a/grace_dl/tensorflow/memory/.DS_Store b/grace_dl/tensorflow/memory/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:16:16 -0400 Subject: [PATCH 35/44] Delete grace_dl/tensorflow/compressor/.DS_Store --- grace_dl/tensorflow/compressor/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/tensorflow/compressor/.DS_Store diff --git a/grace_dl/tensorflow/compressor/.DS_Store b/grace_dl/tensorflow/compressor/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:16:26 -0400 Subject: [PATCH 36/44] Delete grace_dl/tensorflow/communicator/.DS_Store --- grace_dl/tensorflow/communicator/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 grace_dl/tensorflow/communicator/.DS_Store diff --git 
a/grace_dl/tensorflow/communicator/.DS_Store b/grace_dl/tensorflow/communicator/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 22 Jul 2024 01:18:58 -0400 Subject: [PATCH 37/44] Delete examples/.DS_Store --- examples/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/.DS_Store diff --git a/examples/.DS_Store b/examples/.DS_Store deleted file mode 100644 index 4fd8dd2eaf137c68d1dd38c85df602c3ff83ce22..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z<+|O({YS3VK`cTCpiw5icRu7cim+m73UMgE3pu)Er77cYPsW#OHBl zcLOaJk0N#kcE8#A+0A^A{b7u8Z#r-ovl(L+G(?Wd3PJNq*MsH-rTKE1)=6)7r(}@?Pw>Y|zG7d{U z2(P1I-nX~UWs(I^GE7xMG#En2-A$AXWbVmnGMK1bUpuUZ)#%%Mv)Mt{X^BqPnYYBu zX`ggj;^_2vK5tk%`-f+j{pa{4k#Cww4wNg|GFZYpC<`UM`e_o&=NM(}`e zoeHQ^xp`u6oep+k;#`A;MxD;MS{ddsE0>QKu2u)TP~nWb8mT7+h=FAWDtc(+`F{bw z%*sdpate)z0b<~vF~F-mzvn?w=4|~|9-g%V+5 Date: Mon, 22 Jul 2024 01:21:10 -0400 Subject: [PATCH 38/44] Create pytorch_mnist_process_set.py --- examples/torch/pytorch_mnist_process_set.py | 176 ++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 examples/torch/pytorch_mnist_process_set.py diff --git a/examples/torch/pytorch_mnist_process_set.py b/examples/torch/pytorch_mnist_process_set.py new file mode 100644 index 0000000..00b71e5 --- /dev/null +++ b/examples/torch/pytorch_mnist_process_set.py @@ -0,0 +1,176 @@ +from __future__ import print_function + +import argparse + +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import datasets, transforms + +import horovod.torch as hvd +from 
grace_dl.torch.communicator.allgather import Allgather +from grace_dl.torch.compressor.topk import TopKCompressor +from grace_dl.torch.memory.residual import ResidualMemory + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +# Horovod: initialize library. +hvd.init() +torch.manual_seed(args.seed) + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + +# Horovod: limit # of CPU threads to be used per worker. 
+torch.set_num_threads(1) + +kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} +train_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the training data. +train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + +test_dataset = \ + datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the test data. +test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +model = Net() + +if args.cuda: + # Move model to GPU. + model.cuda() + +# Horovod: scale learning rate by the number of GPUs. +optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + +# Horovod: broadcast parameters & optimizer state. 
+hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + +# GRACE: compression algorithm. +# grc = Allgather(TopKCompressor(0.3), ResidualMemory(), hvd.size()) + +from grace_dl.torch.helper import grace_from_params +params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} +grc = grace_from_params(params) + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer(optimizer, grc, named_parameters=model.named_parameters()) + + +def train(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. 
+ test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + if hvd.rank() == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + + +for epoch in range(1, args.epochs + 1): + train(epoch) + test() From 8974e5e40643f130adf2b67ea42bb8ebd747cc8c Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:34:06 -0400 Subject: [PATCH 39/44] Update pytorch_mnist_process_set.py --- examples/torch/pytorch_mnist_process_set.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/torch/pytorch_mnist_process_set.py b/examples/torch/pytorch_mnist_process_set.py index 00b71e5..8221502 100644 --- a/examples/torch/pytorch_mnist_process_set.py +++ b/examples/torch/pytorch_mnist_process_set.py @@ -38,6 +38,9 @@ # Horovod: initialize library. hvd.init() +even_set = hvd.ProcessSet([0,2]) +odd_set = hvd.ProcessSet([1,3]) +hvd.init(process_sets=[even_set, odd_set]) torch.manual_seed(args.seed) if args.cuda: @@ -114,7 +117,10 @@ def forward(self, x): grc = grace_from_params(params) # Horovod: wrap optimizer with DistributedOptimizer. 
-optimizer = hvd.DistributedOptimizer(optimizer, grc, named_parameters=model.named_parameters()) +if(hvd.ProcessSet.included(even_set)): + optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(),grace=grc,process_set=even_set) +else: + optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(),grace=grc,process_set=odd_set) def train(epoch): @@ -138,7 +144,10 @@ def train(epoch): def metric_average(val, name): tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) + if(hvd.ProcessSet.included(even_set)): + avg_tensor = hvd.allreduce(tensor, name=name,process_set=even_set) + else: + avg_tensor = hvd.allreduce(tensor, name=name,process_set=odd_set) return avg_tensor.item() From 4213c39b2081412e3b8c24b1238352e14d6aa163 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:53:30 -0400 Subject: [PATCH 40/44] Update pytorch_mnist_process_set.py --- examples/torch/pytorch_mnist_process_set.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/torch/pytorch_mnist_process_set.py b/examples/torch/pytorch_mnist_process_set.py index 8221502..caa7798 100644 --- a/examples/torch/pytorch_mnist_process_set.py +++ b/examples/torch/pytorch_mnist_process_set.py @@ -114,12 +114,14 @@ def forward(self, x): from grace_dl.torch.helper import grace_from_params params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather'} -grc = grace_from_params(params) + # Horovod: wrap optimizer with DistributedOptimizer. 
if(hvd.ProcessSet.included(even_set)): + grc = grace_from_params(params,even_set) optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(),grace=grc,process_set=even_set) else: + grc = grace_from_params(params,odd_set) optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(),grace=grc,process_set=odd_set) @@ -175,7 +177,10 @@ def test(): test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. - if hvd.rank() == 0: + if(hvd.ProcessSet.included(even_set) and hvd.ProcessSet.rank(even_set)==0): + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + elif(hvd.ProcessSet.included(odd_set) and hvd.ProcessSet.rank(odd_set)==0): print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) From fe7867e9ca17d3a258d5c1b158138ec56fd89a7c Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Mon, 22 Jul 2024 01:55:21 -0400 Subject: [PATCH 41/44] Update topk.py --- grace_dl/torch/compressor/topk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/grace_dl/torch/compressor/topk.py b/grace_dl/torch/compressor/topk.py index 381d4ec..6d2a750 100644 --- a/grace_dl/torch/compressor/topk.py +++ b/grace_dl/torch/compressor/topk.py @@ -13,7 +13,6 @@ def sparsify(tensor, compress_ratio): def desparsify(tensors, numel): values, indices = tensors - # print("values, indices",values, indices,numel) tensor_decompressed = torch.zeros(numel, dtype=values.dtype, layout=values.layout, device=values.device) tensor_decompressed.scatter_(0, indices, values) return tensor_decompressed From eb821621ef610ee39491996f89c6928c15820b15 Mon Sep 17 00:00:00 2001 From: Garvit Banga Date: Tue, 23 Jul 2024 17:54:00 -0400 Subject: [PATCH 42/44] backward pass reset count --- .DS_Store | Bin 0 -> 8196 bytes patch_files/horovod/.DS_Store | Bin 0 -> 6148 bytes 
patch_files/horovod/torch/optimizer.py | 1 + 3 files changed, 1 insertion(+) create mode 100644 .DS_Store create mode 100644 patch_files/horovod/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c8ee896d5f1aa3e1ba77e68893026deccc0aa22f GIT binary patch literal 8196 zcmeHMO^?z*7=DLEcL>pNaNXTo6AxZ67KGgxFBS0*Sfd9u*iwYX(jgRN6GFmW@BS9= zW)p9oysIbu4IX^w1L+4{jR$qznPlcEo#&m-JP)*e2LK?gsaFHo0|1JZW2KDEhQj-K z&6I*Vvx7)59$*9k^dNF)(I~^#C$s`u0j+>mKr5gX_!ktwXErZ1=eaL?b*mN73YLHOaxFyN58y&-ZLc;h%!|5#ixcJC%&LQa&F-?H+_@m1r<*uK)^00hXmfEK?d~7n z-puoUi8mU=u34~fsO-F6*yAu@VITYWVkz`tjBa(&wGMa?@ZFZb=~m-L6WwZOx&%vA4l&luiQ$FHDqhIPU0opx+=aX88=dHmGL*_zy#eI zpj*$tU+<9p51!>LFYa6pCE*|W^d9}oVa8qj`mOinqI5Brz4r3w?1ZBQ@$Qf_Zpv4j zt}a@EQ?0<3oc$ip|L^AS|DWnL>d9*bv;rq9K#Fa<-9palT{=zYxwe7zDOO%MZz)lP rU?b9TM3Igo{`kWXj}4^q7)#+&BA&tW$2S5J?_v7+FHF$QnN#39JjP#X literal 0 HcmV?d00001 diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..28d2290514514ddbf1620ed059e6e1e158624699 GIT binary patch literal 6148 zcmeHK%}(1u5S|SQY(TAYC=y6Ltki2HoD|dp7n6n;z^ELcO6@oh3&$J9PJM_X`QD!U zF1!Gbz&mi@9hm(IN-(Ims%mGX*>87tW`Dl5c0B;VI@7QP&;S6As<2YW;ssItN=s6) zmQ6&a$M7Ky(|#OC8JE1t@HaA`&Tbuga18^Pz_atKy=>lu480RbL1N4$#EALwx!2G^ zb@gEwfCE&kOT%G}HRY?^lb$+>Es;^z}A>VEJg z8sxpkyU#Mo{3sbrbU+ji(BnDs(Lt2KCi}g-!twt7z7Ob(Kq~7-7-1SQx|}6lOvZO(?S|1~cJk z7uL_USQs?nz-;ot?99w&C``|e;|uK$%r!_QF+dEgGO%Wv4fXyXzn=fE22qO`AO`*^ z23WQ0bvw8v`?i)gr`}qD`VCcy@+%BJg Date: Tue, 23 Jul 2024 17:54:37 -0400 Subject: [PATCH 43/44] Delete .DS_Store --- .DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index c8ee896d5f1aa3e1ba77e68893026deccc0aa22f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMO^?z*7=DLEcL>pNaNXTo6AxZ67KGgxFBS0*Sfd9u*iwYX(jgRN6GFmW@BS9= 
zW)p9oysIbu4IX^w1L+4{jR$qznPlcEo#&m-JP)*e2LK?gsaFHo0|1JZW2KDEhQj-K z&6I*Vvx7)59$*9k^dNF)(I~^#C$s`u0j+>mKr5gX_!ktwXErZ1=eaL?b*mN73YLHOaxFyN58y&-ZLc;h%!|5#ixcJC%&LQa&F-?H+_@m1r<*uK)^00hXmfEK?d~7n z-puoUi8mU=u34~fsO-F6*yAu@VITYWVkz`tjBa(&wGMa?@ZFZb=~m-L6WwZOx&%vA4l&luiQ$FHDqhIPU0opx+=aX88=dHmGL*_zy#eI zpj*$tU+<9p51!>LFYa6pCE*|W^d9}oVa8qj`mOinqI5Brz4r3w?1ZBQ@$Qf_Zpv4j zt}a@EQ?0<3oc$ip|L^AS|DWnL>d9*bv;rq9K#Fa<-9palT{=zYxwe7zDOO%MZz)lP rU?b9TM3Igo{`kWXj}4^q7)#+&BA&tW$2S5J?_v7+FHF$QnN#39JjP#X From 7029f8ac023288406900abdd30dd9a43c289d053 Mon Sep 17 00:00:00 2001 From: Garvit Banga <37771583+GarvitBanga@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:54:48 -0400 Subject: [PATCH 44/44] Delete patch_files/horovod/.DS_Store --- patch_files/horovod/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 patch_files/horovod/.DS_Store diff --git a/patch_files/horovod/.DS_Store b/patch_files/horovod/.DS_Store deleted file mode 100644 index 28d2290514514ddbf1620ed059e6e1e158624699..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}(1u5S|SQY(TAYC=y6Ltki2HoD|dp7n6n;z^ELcO6@oh3&$J9PJM_X`QD!U zF1!Gbz&mi@9hm(IN-(Ims%mGX*>87tW`Dl5c0B;VI@7QP&;S6As<2YW;ssItN=s6) zmQ6&a$M7Ky(|#OC8JE1t@HaA`&Tbuga18^Pz_atKy=>lu480RbL1N4$#EALwx!2G^ zb@gEwfCE&kOT%G}HRY?^lb$+>Es;^z}A>VEJg z8sxpkyU#Mo{3sbrbU+ji(BnDs(Lt2KCi}g-!twt7z7Ob(Kq~7-7-1SQx|}6lOvZO(?S|1~cJk z7uL_USQs?nz-;ot?99w&C``|e;|uK$%r!_QF+dEgGO%Wv4fXyXzn=fE22qO`AO`*^ z23WQ0bvw8v`?i)gr`}qD`VCcy@+%BJg