From 171f8ec7c07716bbe09a448bd2c0d8cb4816fc3d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 15:09:25 -0700 Subject: [PATCH 01/28] Updated LibTorch to 1.7.0 --- .travis.yml | 2 +- codegen/generate_functions.rb | 10 +- codegen/native_functions.yaml | 3077 +++++++++++++++++++++++++-------- ext/torch/ext.cpp | 4 +- ext/torch/ruby_arg_parser.h | 28 +- ext/torch/templates.h | 1 + ext/torch/wrap_outputs.h | 7 + 7 files changed, 2409 insertions(+), 720 deletions(-) diff --git a/.travis.yml b/.travis.yml index a2c40c9f..160cb095 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ jobs: - rvm: 2.5 dist: xenial before_install: - - export LIBTORCH_VERSION=1.6.0 + - export LIBTORCH_VERSION=1.7.0 - ./test/ci/install_libtorch.sh cache: bundler: true diff --git a/codegen/generate_functions.rb b/codegen/generate_functions.rb index 16e7c075..8d18df89 100644 --- a/codegen/generate_functions.rb +++ b/codegen/generate_functions.rb @@ -328,6 +328,8 @@ def generate_function_params(function, params, remove_self) "tensorlist" when /\Aint\[/ "intlist" + when "float[]" + "doublelist" when "Scalar" "scalar" when "bool" @@ -419,6 +421,8 @@ def generate_dispatch_params(function, params) "double" when /\Aint\[/ "IntArrayRef" + when "float[]" + "ArrayRef" when "str" "std::string" when "Scalar", "bool", "ScalarType", "Layout", "Device", "Storage", "Generator", "MemoryFormat", "Storage" @@ -466,7 +470,9 @@ def generate_dispatch_retval(function) when ["Tensor", "Tensor", "Tensor", "Tensor", "Tensor"] "std::tuple" when ["Tensor", "Tensor", "float", "int"] - "std::tuple" + "std::tuple" + when ["float", "float"] + "std::tuple" else raise "Unknown retvals: #{types}" end @@ -539,6 +545,8 @@ def signature_type(param) "std::string" when "Scalar", "Dimname", "bool", "ScalarType", "Layout", "Device", "Generator", "MemoryFormat", "Storage" param[:type] + when "float[]" + "ArrayRef" else raise "Unknown type: #{param[:type]}" end diff --git a/codegen/native_functions.yaml 
b/codegen/native_functions.yaml index 859c8773..4d748250 100644 --- a/codegen/native_functions.yaml +++ b/codegen/native_functions.yaml @@ -47,6 +47,7 @@ # Computes the gradient of current tensor w.r.t. graph leaves. - func: backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> () + use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -94,6 +95,7 @@ variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) + use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -125,12 +127,6 @@ - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) variants: method -- func: unflatten.Dimname(Tensor self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor - variants: method - -- func: unflatten.int(Tensor self, int dim, int[] sizes, Dimname[] names) -> Tensor - variants: method - - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool use_c10_dispatcher: full dispatch: @@ -150,14 +146,17 @@ CUDA: _cudnn_rnn_flatten_weight - func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn - func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? 
dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_backward -- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: _cudnn_init_dropout_state @@ -168,21 +167,25 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) + use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) + use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) + use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor use_c10_dispatcher: full @@ -194,54 +197,81 @@ use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) 
+ use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: abs_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out + +# Note [Adding an alias] +# To add an alias do the following: +# +# 1) Copy the original functions native_functions.yaml entry, but replace the +# original function's name with their own and delete any dispatch +# keys for the aliases. Specifying a dispatch key will prevent +# autograd from recording the operations the alias performs, which +# will stop it from "inheriting" the original operation's autograd behavior. +# 2) Implement the corresponding functions and have them redispatch to the +# original function. +# 3) Add entries for the alias (and original function, if needed) to +# aten/src/ATen/core/interned_strings.h +# (This may require removing an entry from ATen/core/aten_interned_strings.h.) +# 4) Add docstrings to the new function that reference the original function, +# and document the method as usual (if it exists.) +# (See torch/_torch_docs.py and docs/source/torch.rst if adding a function, +# torch/_tensor_docs.py and docs/source/tensors.rst if adding a method, +# or module-specific doc bindings (like torch/linalg/__init__.py) if +# adding an alias in a namespace.) +# 5) Update torch/overrides.py consistent with the original function. +# 6) Update the alias_map in torch/csrc/jit/passes/normalize_ops.cpp. 
+# 7) Add entries to test/test_op_aliases.py's "alias_infos" +# +# See torch.absolute, an alias for torch.abs, as an example. +# Absolute, alias for abs - func: absolute(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: abs - CUDA: abs - func: absolute_(Tensor(a!) self) -> Tensor(a!) - variants: function, method - dispatch: - CPU: abs_ - CUDA: abs_ + use_c10_dispatcher: full + variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: abs_out - CUDA: abs_out - func: angle(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -251,6 +281,17 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function @@ -264,15 +305,35 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out + +- func: _conj(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function - func: acos(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: acos_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out + +# arccos, alias of acos +- func: arccos(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arccos_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full + variants: function, method + +- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor use_c10_dispatcher: full @@ -288,48 +349,69 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: add - CUDA: add - SparseCPU: add_sparse - SparseCUDA: add_sparse + CPU, CUDA: add + SparseCPU, SparseCUDA: add_sparse MkldnnCPU: mkldnn_add - Vulkan: vulkan_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: add_ - CUDA: add_ - SparseCPU: add_sparse_ - SparseCUDA: add_sparse_ + CPU, CUDA: add_ + SparseCPU, SparseCUDA: add_sparse_ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: add_out - CUDA: add_out + CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda MkldnnCPU: mkldnn_add_out +- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: function + dispatch: + CPU: add_relu_ + +- func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_out + # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv_ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda @@ -339,6 +421,7 @@ variants: function, method - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -354,8 +437,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -369,8 +456,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -378,10 +469,13 @@ - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!) - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -402,60 +496,89 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: argmax - CUDA: argmax + CPU, CUDA: argmax - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: argmin - CUDA: argmin + CPU, CUDA: argmin - func: acosh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: acosh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True + dispatch: + CPU, CUDA: acosh_out + +# arccosh, alias for acosh +- func: arccosh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arccosh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: asinh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: asinh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- supports_named_tensor: True + dispatch: + CPU, CUDA: asinh_out + +# arcsinh, alias for asinh +- func: arcsinh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arcsinh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atanh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: atanh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True + dispatch: + CPU, CUDA: atanh_out + +# arctanh, alias for atanh +- func: arctanh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arctanh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) use_c10_dispatcher: full variants: function, method dispatch: - CPU: as_strided_tensorimpl - CUDA: as_strided_tensorimpl - QuantizedCPU: as_strided_qtensorimpl - QuantizedCUDA: as_strided_qtensorimpl + CPU, CUDA: as_strided_tensorimpl + QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl device_guard: False - func: as_strided_(Tensor(a!) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a!) @@ -467,18 +590,73 @@ variants: function, method - func: asin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse + +# arcsin, alias of asin +- func: arcsin(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arcsin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atan(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: atan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out + +# arctan, alias of atan +- func: arctan(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arctan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atleast_1d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + +- func: atleast_2d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + variants: function + +- func: atleast_3d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full @@ -488,12 +666,14 @@ CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) 
self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -503,19 +683,24 @@ CUDA: baddbmm_out_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full - func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor - requires_tensor: True + use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_batch_norm - func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int) + use_c10_dispatcher: full - func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor @@ -523,12 +708,18 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) 
variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -537,8 +728,10 @@ variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor + use_c10_dispatcher: full - func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn variants: function dispatch: @@ -553,6 +746,7 @@ CUDA: binary_cross_entropy_out_cuda - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn variants: function dispatch: @@ -567,12 +761,15 @@ CUDA: binary_cross_entropy_backward_out_cuda - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full variants: function - func: binary_cross_entropy_with_logits_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full variants: function - func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU: _bincount_cpu @@ -583,64 +780,66 @@ variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: bitwise_not_out - CUDA: bitwise_not_out + CPU, CUDA: bitwise_not_out - func: logical_not(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_not_out - CUDA: logical_not_out + CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_xor_out - CUDA: logical_xor_out + CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_and_out - CUDA: logical_and_out + CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_or_out - CUDA: logical_or_out + CPU, CUDA: logical_or_out - func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bmm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full @@ -692,17 +891,22 @@ variants: function, method - func: ceil_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ceil_out - CUDA: ceil_out + CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor use_c10_dispatcher: full variants: function +- func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method @@ -712,63 +916,108 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: clamp - CUDA: clamp - QuantizedCPU: quantized_clamp - Vulkan: vulkan_clamp + CPU, CUDA: clamp + QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full variants: function, method - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full variants: function, method - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: clamp_min_out + +# clip is an alias for clamp +- func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + variants: function, method + +- func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) - func: cudnn_is_acceptable(Tensor self) -> bool use_c10_dispatcher: full device_guard: False +- func: complex(Tensor real, Tensor imag) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out + +- func: polar(Tensor abs, Tensor angle) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out + - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full variants: function -- func: contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor +- func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor + use_c10_dispatcher: full - func: convolution_overrideable(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor + use_c10_dispatcher: full - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) use_c10_dispatcher: full -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor + use_c10_dispatcher: full + +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full - func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding) -> Tensor + use_c10_dispatcher: full -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor use_c10_dispatcher: full @@ -778,13 +1027,16 @@ # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor + use_c10_dispatcher: full - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: method device_guard: False @@ -797,22 +1049,38 @@ variants: function, method - func: cos_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: cosh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full +- func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: count_nonzero + +- func: count_nonzero(Tensor self, int? dim=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid use_c10_dispatcher: full dispatch: @@ -825,60 +1093,74 @@ CUDA: cudnn_affine_grid_generator_backward - func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: cudnn_batch_norm # NB: You can only use this if you used cudnn_batch_norm training=True - func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: cudnn_batch_norm_backward - func: cudnn_convolution.deprecated(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated -- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full + dispatch: + CUDA: cudnn_convolution_deprecated2 + +- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution -- func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input -- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor) +- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward -- func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, 
bool deterministic) -> Tensor +- func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight - func: cudnn_convolution_transpose.deprecated(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated -- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full + dispatch: + CUDA: cudnn_convolution_transpose_deprecated2 + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards -- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor) +- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) use_c10_dispatcher: 
full dispatch: CUDA: cudnn_convolution_transpose_backward -- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input -- func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight @@ -928,7 +1210,13 @@ CPU: cummin_helper_cpu CUDA: cummin_helper_cuda +- func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -938,7 +1226,13 @@ - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumsum.out(Tensor self, int dim, *, ScalarType? 
dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -958,7 +1252,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -967,10 +1261,6 @@ CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu -- func: det(Tensor self) -> Tensor - use_c10_dispatcher: full - variants: function, method - - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor use_c10_dispatcher: full variants: function, method @@ -986,32 +1276,33 @@ - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) variants: function, method +- func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: div - CUDA: div - SparseCPU: div_sparse - SparseCUDA: div_sparse + CPU, CUDA: div + SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: div_ - CUDA: div_ - SparseCPU: div_sparse_ - SparseCUDA: div_sparse_ + CPU, CUDA: div_ + SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: div_out - CUDA: div_out - SparseCPU: div_out_sparse_zerodim - SparseCUDA: div_out_sparse_zerodim + CPU, CUDA: div_out + SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor @@ -1019,24 +1310,72 @@ variants: function, method - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method -- func: dot(Tensor self, Tensor tensor) -> Tensor +# divide, alias for div +- func: divide.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: legacy::cpu::_th_dot - CUDA: legacy::cuda::_th_dot - -- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) -- func: einsum(str equation, Tensor[] tensors) -> Tensor +- func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full + variants: method -- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor - use_c10_dispatcher: full +- func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -- func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor +- func: divide.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + + # true_divide, an alias for div +- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: dot(Tensor self, Tensor tensor) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU: dot + CUDA: dot_cuda + +- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) + +- func: vdot(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU: vdot + CUDA: vdot_cuda + +- func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: einsum(str equation, Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor + use_c10_dispatcher: full + +- func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor use_c10_dispatcher: full - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor @@ -1046,6 +1385,7 @@ CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ @@ -1062,18 +1402,30 @@ # applying indices = indices.contiguous(). # The backward functions apply a check that these input tensors are contiguous. + +- func: _embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? 
per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full + dispatch: + CPU: _embedding_bag_forward_only_cpu + CUDA: _embedding_bag_forward_only_cuda + - func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: _embedding_bag_cpu CUDA: _embedding_bag_cuda - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full - func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full - func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full dispatch: CPU: _embedding_bag_dense_backward_cpu CUDA: _embedding_bag_dense_backward_cuda @@ -1085,118 +1437,150 @@ CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + #use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + #use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda MkldnnCPU: empty_mkldnn - SparseCPU: empty_sparse - SparseCUDA: empty_sparse - Vulkan: empty_vulkan + SparseCPU, SparseCUDA: empty_sparse - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + #use_c10_dispatcher: full variants: method - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_affine_quantized_other_backends_stub - QuantizedCPU: empty_affine_quantized - QuantizedCUDA: empty_affine_quantized + QuantizedCPU, QuantizedCUDA: empty_affine_quantized # it's a factory function receiving a tensor argument, thus overriding explicitly # other overrides are to provide a more helpful error message that dtype is required - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full category_override: factory dispatch: CPU: empty_per_channel_affine_quantized_other_backends_stub - QuantizedCPU: empty_per_channel_affine_quantized_cpu + QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: method device_guard: False + dispatch: + CPU: resize_ + CUDA: resize_cuda_ + QuantizedCPU: quantized_resize_cpu_ - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - QuantizedCPU: empty_quantized - QuantizedCUDA: empty_quantized + QuantizedCPU, QuantizedCUDA: empty_quantized - func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) device_guard: False - func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda - Vulkan: empty_strided_vulkan - func: erf(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: erf_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: erfc_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: exp_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out + +- func: exp2(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: exp2_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: expm1_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: expm1_out - CUDA: expm1_out + CPU, CUDA: expm1_out - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False -- func: expand_as(Tensor self, Tensor other) -> Tensor +- func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -1208,23 +1592,31 @@ CPU: eye_out_cpu CUDA: eye_out_cuda -- func: flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> Tensor +- func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) use_c10_dispatcher: full variants: function, method -- func: flatten.named_out_dim(Tensor self, int start_dim, int end_dim, Dimname out_dim) -> Tensor +- func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) variants: function, method -- func: flatten.using_names(Tensor self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor +- func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) variants: function, method -- func: flatten.DimnameList(Tensor self, Dimname[] dims, Dimname out_dim) -> Tensor +- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) variants: function, method +- func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a) + variants: method + +- func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) + variants: method + - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: floor(Tensor self) -> Tensor @@ -1232,42 +1624,38 @@ variants: function, method - func: floor_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: floor_out - CUDA: floor_out + CPU, CUDA: floor_out - func: floor_divide(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: floor_divide - CUDA: floor_divide - SparseCPU: floor_divide_sparse - SparseCUDA: floor_divide_sparse + CPU, CUDA: floor_divide + SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: floor_divide_ - CUDA: floor_divide_ - SparseCPU: floor_divide_sparse_ - SparseCUDA: floor_divide_sparse_ + CPU, CUDA: floor_divide_ + SparseCPU, SparseCUDA: floor_divide_sparse_ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: floor_divide_out - CUDA: floor_divide_out - SparseCPU: floor_divide_out_sparse_zerodim - SparseCUDA: floor_divide_out_sparse_zerodim + CPU, CUDA: floor_divide_out + SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: function, method - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: frac(Tensor self) -> Tensor @@ -1275,29 +1663,63 @@ variants: function, method - func: frac_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False - func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) 
- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: from_file +- func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out + +- func: gcd(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out + +- func: lcm(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + # NOTE [ grid_sampler Native Functions ] # `grid_sampler` does all the shape checking and then dispatches to one of # `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which # has the corresponding backward defined as native functions as well. Therefore, # in these functions and their backwards, no more shape checking is done. # +# There is also _grid_sampler_2d_backward_cpu_fallback which is an +# implementation detail of grid_sampler_2d and is only exposed here for testing +# purposes. +# # Additionally, arguments `padding_mode` and `interpolation_mode` are cast to # enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in # `interpolation_mode` because it only supports Bilinear interpolation mode. 
@@ -1318,6 +1740,13 @@ CPU: grid_sampler_2d_backward_cpu CUDA: grid_sampler_2d_backward_cuda +# See NOTE [ grid_sample CPU fallback ] +- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + use_c10_dispatcher: full + +- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) + use_c10_dispatcher: full + - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor use_c10_dispatcher: full dispatch: @@ -1331,43 +1760,48 @@ CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor +- func: kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: full -- func: ger(Tensor self, Tensor vec2) -> Tensor +- func: kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: full - variants: function, method -- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) +- func: kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full + +- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor + use_c10_dispatcher: full - func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor + use_c10_dispatcher: full - func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, int N, int C, int HxW, int group, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: - CPU: native_group_norm - CUDA: native_group_norm + CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CPU: native_group_norm_backward - CUDA: native_group_norm_backward - -# FFT - -- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full - variants: function, method + dispatch: + CPU, CUDA: native_group_norm_backward - func: ifft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full @@ -1384,6 +1818,10 @@ - func: _fft_with_size(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, bool normalized, bool onesided, int[] output_sizes) -> Tensor use_c10_dispatcher: full variants: function + +- func: _fft_with_size.norm_modes(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, int normalization, bool onesided, int[] output_sizes) -> Tensor + use_c10_dispatcher: full + variants: function dispatch: CPU: _fft_mkl CUDA: _fft_cufft @@ -1402,12 +1840,15 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) # - Tensor Tensor::index(std::initializer_list indices) - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor @@ -1433,8 +1874,11 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? 
running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full variants: function - func: inverse(Tensor self) -> Tensor @@ -1459,10 +1903,8 @@ variants: function, method device_guard: False dispatch: - CPU: isnan - CUDA: isnan - SparseCPU: isnan_sparse - SparseCUDA: isnan_sparse + CPU, CUDA: isnan + SparseCPU, SparseCUDA: isnan_sparse - func: is_distributed(Tensor self) -> bool use_c10_dispatcher: full @@ -1479,6 +1921,10 @@ variants: function, method device_guard: False +- func: isreal(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + - func: is_nonzero(Tensor self) -> bool use_c10_dispatcher: full variants: function, method @@ -1518,21 +1964,26 @@ - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor + use_c10_dispatcher: full - func: native_layer_norm(Tensor input, Tensor? weight, Tensor? bias, int M, int N, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: layer_norm_cpu CUDA: layer_norm_cuda - func: native_layer_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int M, int N, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + use_c10_dispatcher: full python_module: nn - func: mkldnn_linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: MkldnnCPU: mkldnn_linear @@ -1561,9 +2012,10 @@ - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor use_c10_dispatcher: full -- func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: linspace.out(Scalar start, Scalar end, int steps=100, *, Tensor(a!) out) -> Tensor(a!) +- func: linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: linspace_cpu_out CUDA: linspace_cuda_out @@ -1573,63 +2025,64 @@ variants: function, method - func: log_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log_out - CUDA: log_out + CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log10_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log10_out - CUDA: log10_out + CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log1p_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method dispatch: - CPU: log1p_ - CUDA: log1p_ - SparseCPU: log1p_sparse_ - SparseCUDA: log1p_sparse_ + CPU, CUDA: log1p_ + SparseCPU, SparseCUDA: log1p_sparse_ - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: log1p_out - CUDA: log1p_out - SparseCPU: log1p_out_sparse - SparseCUDA: log1p_out_sparse + CPU, CUDA: log1p_out + SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log2_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log2_out - CUDA: log2_out + CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1639,15 +2092,17 @@ use_c10_dispatcher: full variants: function, method -- func: logspace(Scalar start, Scalar end, int steps=100, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: logspace.out(Scalar start, Scalar end, int steps=100, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +- func: logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: logspace_cpu_out CUDA: logspace_cuda_out # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor @@ -1677,6 +2132,7 @@ CUDA: _logcumsumexp_out_cuda - func: logcumsumexp(Tensor self, int dim) -> Tensor + use_c10_dispatcher: full variants: function, method - func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) @@ -1716,24 +2172,61 @@ use_c10_dispatcher: full variants: function, method -- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) +- func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp -- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) +- func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor + use_c10_dispatcher: full + +- func: _aminmax(Tensor self) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _aminmax_all + +- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _aminmax + +- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor + dispatch: + CPU, CUDA: _compute_linear_combination -- func: max_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor +- func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: _compute_linear_combination_out + +- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method +- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) 
indices) + dispatch: + CPU, CUDA: max_out + - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) -- func: max_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor +- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + +- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + use_c10_dispatcher: full variants: function, method +- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amax_out + # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -1746,13 +2239,21 @@ - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full - requires_tensor: True dispatch: MkldnnCPU: mkldnn_max_pool2d +- func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + MkldnnCPU: mkldnn_max_pool3d + +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + QuantizedCPU: quantized_max_pool1d + - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full - requires_tensor: True 
dispatch: QuantizedCPU: quantized_max_pool2d @@ -1762,25 +2263,23 @@ # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: mean_cpu_gpu - CUDA: mean_cpu_gpu - QuantizedCPU: quantized_mean_cpu + CPU, CUDA: mean_cpu_gpu + QuantizedCPU: mean_quantized_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: mean_cpu_gpu - CUDA: mean_cpu_gpu - QuantizedCPU: quantized_mean_cpu - Vulkan: mean_vulkan + CPU, CUDA: mean_cpu_gpu + QuantizedCPU: mean_quantized_cpu - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: mean_out_cpu_gpu - CUDA: mean_out_cpu_gpu - QuantizedCPU: quantized_mean_out_cpu + CPU, CUDA: mean_out_cpu_gpu + QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -1803,20 +2302,24 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) - -- func: min_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full - variants: function, method + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) -- func: min_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor +- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + use_c10_dispatcher: full variants: function, method +- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out + - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + use_c10_dispatcher: full - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor use_c10_dispatcher: full @@ -1828,14 +2331,17 @@ use_c10_dispatcher: full - func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_batch_norm - func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_batch_norm_backward - func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_convolution @@ -1860,6 +2366,7 @@ CUDA: miopen_convolution_backward_weight - func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose @@ -1881,6 +2388,7 @@ CUDA: miopen_convolution_transpose_backward_weight - func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution @@ -1900,10 +2408,12 @@ CUDA: miopen_depthwise_convolution_backward_weight - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_rnn - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + use_c10_dispatcher: full dispatch: CUDA: miopen_rnn_backward @@ -1913,15 +2423,13 @@ dispatch: CPU: mm_cpu CUDA: mm_cuda - SparseCPU: _sparse_mm - SparseCUDA: _sparse_mm + SparseCPU, SparseCUDA: _sparse_mm - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: mm_cpu_out CUDA: mm_out_cuda - SparseCPU: _sparse_mm_out - SparseCUDA: _sparse_mm_out + SparseCPU, SparseCUDA: _sparse_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor use_c10_dispatcher: full @@ -1929,6 +2437,8 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: mode - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1941,25 +2451,21 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: mul - CUDA: mul - SparseCPU: mul_sparse - SparseCUDA: mul_sparse + CPU, CUDA: mul + SparseCPU, SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: mul_ - CUDA: mul_ - SparseCPU: mul_sparse_ - SparseCUDA: mul_sparse_ + CPU, CUDA: mul_ + SparseCPU, SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: mul_out - CUDA: mul_out + CPU, CUDA: mul_out SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -1970,16 +2476,34 @@ variants: function, method - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# multiply, alias for mul +- func: multiply.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: multiply.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: multiply_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: mv - CUDA: mv - SparseCPU: mv_sparse - SparseCUDA: mv_sparse + CPU, CUDA: mv + SparseCPU, SparseCUDA: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) @@ -1988,16 +2512,15 @@ variants: function, method - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor use_c10_dispatcher: full variants: method dispatch: - CPU: narrow_copy_dense - CUDA: narrow_copy_dense - SparseCPU: narrow_copy_sparse - SparseCUDA: narrow_copy_sparse + CPU, CUDA: narrow_copy_dense + SparseCPU, SparseCUDA: narrow_copy_sparse - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) use_c10_dispatcher: full @@ -2010,6 +2533,7 @@ device_guard: False - func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda @@ -2025,6 +2549,7 @@ CUDA: batch_norm_stats_cuda - func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: batch_norm_elemt_cuda @@ -2034,27 +2559,33 @@ # for backward compatibility - func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_gather_stats_cuda - func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? 
running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_gather_stats_with_counts_cuda - func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_backward_reduce_cuda - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: batch_norm_backward_elemt_cuda - func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_update_stats_cpu CUDA: batch_norm_update_stats_cuda @@ -2066,6 +2597,7 @@ use_c10_dispatcher: full - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor + use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) @@ -2084,10 +2616,12 @@ device_guard: False - func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: ones.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor use_c10_dispatcher: full @@ -2100,18 +2634,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2121,6 +2663,14 @@ use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + +- func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. 
# @@ -2139,13 +2689,13 @@ use_c10_dispatcher: full dispatch: CPU: channel_shuffle - QuantizedCPU: quantized_channel_shuffle + QuantizedCPU: channel_shuffle_quantized_cpu - func: is_pinned(Tensor self) -> bool use_c10_dispatcher: full variants: method -- func: pin_memory(Tensor self) -> Tensor +- func: pin_memory(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method @@ -2160,28 +2710,25 @@ - func: rad2deg(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True - func: deg2rad(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2190,6 +2737,7 @@ device_guard: False - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2198,12 +2746,15 @@ - func: rand.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint(int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.generator(int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: randint.low(int low, int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.low_generator(int low, int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2216,10 +2767,13 @@ - func: randint.low_generator_out(int low, int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randn(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randn.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2234,8 +2788,10 @@ - func: randn.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2247,8 +2803,10 @@ CUDA: randperm_out_cuda - func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -2260,21 +2818,39 @@ variants: function, method - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: neg_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: neg_out - CUDA: neg_out + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse + +# Alias for neg +- func: negative(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: negative_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2295,7 +2871,7 @@ use_c10_dispatcher: full variants: function, method -- func: reshape(Tensor self, int[] shape) -> Tensor +- func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) use_c10_dispatcher: full variants: function, method device_guard: False @@ -2303,11 +2879,10 @@ - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_reshape -- func: reshape_as(Tensor self, Tensor other) -> Tensor +- func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method device_guard: False @@ -2317,6 +2892,7 @@ variants: function, method - func: round_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -2332,18 +2908,17 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: relu - CUDA: relu + CPU, CUDA: relu MkldnnCPU: mkldnn_relu - QuantizedCPU: quantized_relu + QuantizedCPU: relu_quantized_cpu - func: relu_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function, method dispatch: - CPU: relu_ - CUDA: relu_ + CPU, CUDA: relu_ MkldnnCPU: mkldnn_relu_ - QuantizedCPU: quantized_relu_ + QuantizedCPU: relu_quantized_cpu_ - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full @@ -2373,25 +2948,35 @@ CPU: gelu_backward_cpu CUDA: gelu_backward_cuda +- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + python_module: nn + device_guard: False + - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: rsqrt_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: rsqrt_out - CUDA: rsqrt_out + CPU, CUDA: rsqrt_out - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method @@ -2402,54 +2987,98 @@ variants: function, method device_guard: False +- func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full - func: selu_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor use_c10_dispatcher: full - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) + use_c10_dispatcher: full + +- func: silu(Tensor self) -> Tensor + use_c10_dispatcher: full + python_module: nn + +- func: silu_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full + python_module: nn + +- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: silu_out + +- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor + use_c10_dispatcher: full + python_module: nn - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: sigmoid - CUDA: sigmoid - QuantizedCPU: quantized_sigmoid + CPU, CUDA: sigmoid + QuantizedCPU: sigmoid_quantized_cpu MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method dispatch: - CPU: sigmoid_ - CUDA: sigmoid_ + CPU, CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out + +- func: logit(Tensor self, float? eps=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: logit + +- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: logit_ + +- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sin_out - CUDA: sin_out + CPU, CUDA: sin_out - func: sinh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sinh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. 
@@ -2462,16 +3091,15 @@ # to false to make such changes explicitly illegal, in order to prevent users from # changing metadata of the detached tensor and expecting the original tensor to also # be updated. -- func: detach(Tensor self) -> Tensor +- func: detach(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full - manual_kernel_registration: True variants: function, method # Like `detach()`, but modifies this `Variable` in-place. This method may # only be called on non-view `Variable`s. You can use `is_view()` to check # this. If this `Variable` is a view, throws an `std::runtime_error()`. - func: detach_(Tensor(a!) self) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: function, method - func: size.int(Tensor self, int dim) -> int @@ -2488,6 +3116,11 @@ variants: function, method device_guard: False +- func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) use_c10_dispatcher: full variants: function, method @@ -2498,6 +3131,7 @@ # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor @@ -2516,12 +3150,22 @@ CPU: softmax_backward_cpu CUDA: softmax_backward_cuda +- func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method device_guard: False -- func: split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] +- func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + +- func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method device_guard: False @@ -2541,10 +3185,12 @@ device_guard: False - func: squeeze_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False @@ -2568,14 +3214,31 @@ - func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) +- func: hstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: vstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: dstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at # `torch.functional.py`. They shall be moved here once we have mapping between # Python strings and C++ Enum in codegen. -- func: stft(Tensor self, int n_fft, int? hop_length=None, int? 
win_length=None, Tensor? window=None, bool normalized=False, bool onesided=True) -> Tensor +- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + use_c10_dispatcher: full variants: function, method -- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool onesided=True, int? length=None) -> Tensor +- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor + use_c10_dispatcher: full variants: function, method - func: stride.int(Tensor self, int dim) -> int @@ -2588,18 +3251,42 @@ device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: nansum + +- func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: nansum + +- func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out + - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full variants: method @@ -2610,37 +3297,51 @@ variants: function, method - func: sqrt_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: square_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -2648,12 +3349,20 @@ - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!) - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -2666,6 +3375,7 @@ variants: function, method - func: t_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full device_guard: False variants: method @@ -2674,22 +3384,27 @@ variants: function, method - func: tan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: tanh - CUDA: tanh - QuantizedCPU: quantized_tanh + CPU, CUDA: tanh + QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -2702,9 +3417,10 @@ dispatch: CPU: threshold CUDA: threshold_cuda - QuantizedCPU: quantized_threshold + QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -2734,17 +3450,17 @@ - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_transpose_ @@ -2775,7 +3491,7 @@ CPU: roll_cpu CUDA: roll_cuda -# default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor use_c10_dispatcher: full @@ -2793,48 +3509,28 @@ - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor use_c10_dispatcher: full -- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor +- func: trunc(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: true_divide - CUDA: true_divide - SparseCPU: true_divide_sparse - SparseCUDA: true_divide_sparse - -- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - dispatch: - CPU: true_divide_ - CUDA: true_divide_ - SparseCPU: true_divide_sparse_ - SparseCUDA: true_divide_sparse_ - -- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: true_divide_out - CUDA: true_divide_out - SparseCPU: true_divide_out_sparse_zerodim - SparseCUDA: true_divide_out_sparse_zerodim -- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor +- func: trunc_(Tensor(a!) 
self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method -- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method +- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out -- func: trunc(Tensor self) -> Tensor +# Alias for trunc +- func: fix(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method -- func: trunc_(Tensor(a!) self) -> Tensor(a!) +- func: fix_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method -- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: trunc_out - CUDA: trunc_out +- func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2892,6 +3588,7 @@ device_guard: False - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False @@ -2901,12 +3598,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -2916,15 +3619,19 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function -- func: view_as(Tensor self, Tensor other) -> Tensor +- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method device_guard: False @@ -2936,6 +3643,18 @@ use_c10_dispatcher: full variants: function, method +- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function + - func: where(Tensor condition) -> Tensor[] use_c10_dispatcher: full variants: function @@ -2943,6 +3662,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -2974,10 +3695,12 @@ device_guard: False - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: zeros.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full @@ -3020,27 +3743,34 @@ - func: native_norm(Tensor self, Scalar p=2) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: norm_sparse - SparseCUDA: norm_sparse + SparseCPU, SparseCUDA: norm_sparse + +- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor + use_c10_dispatcher: full + dispatch: + SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -3050,12 +3780,16 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor + use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -3065,12 +3799,16 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor @@ -3078,6 +3816,7 @@ variants: function, method - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor @@ -3085,8 +3824,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3124,68 +3867,44 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: clone - CUDA: clone - SparseCPU: clone_sparse - SparseCUDA: clone_sparse + CPU, CUDA: clone + SparseCPU, SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone - QuantizedCPU: quantized_clone - QuantizedCUDA: quantized_clone + QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - manual_kernel_registration: True - variants: function, method - -- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: pow_out - CUDA: pow_out - SparseCPU: pow_out_sparse_scalar - SparseCUDA: pow_out_sparse_scalar - -- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: pow - CUDA: pow - SparseCPU: pow_sparse_scalar - SparseCUDA: pow_sparse_scalar - func: zero_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method, function dispatch: - CPU: zero_ - CUDA: zero_ - SparseCPU: zero_sparse_ - SparseCUDA: zero_sparse_ + CPU, CUDA: zero_ + SparseCPU, SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sub_out - CUDA: sub_out - SparseCPU: sub_out_sparse - SparseCUDA: sub_out_sparse + CPU, CUDA: sub_out + SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: sub - CUDA: sub - SparseCPU: sub_sparse - SparseCUDA: sub_sparse + CPU, CUDA: sub + SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - CPU: sub_ - CUDA: sub_ - SparseCPU: sub_sparse_ - SparseCUDA: sub_sparse_ + CPU, CUDA: sub_ + SparseCPU, SparseCUDA: sub_sparse_ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor @@ -3193,11 +3912,45 @@ variants: function, method - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# subtract, alias for sub +- func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + +- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub + +- func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: heaviside_out + +- func: heaviside(Tensor self, Tensor values) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
+ variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor @@ -3224,12 +3977,12 @@ CUDA: addmm_cuda SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda - Vulkan: vulkan_addmm - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: legacy::cpu::_th_addmm_ + CPU: addmm_cpu_ CUDA: addmm__cuda # Warning! For whatever reason, the inplace sparse addmm is NON # broadcasting @@ -3255,12 +4008,6 @@ # using **Tensor** type, and thus lose autograd tracking on the actual method # they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`. # -# The actual ctors `sparse_coo_tensor_with_dims` and `sparse_coo_tensor_with_dims_and_tensors`, -# on the other hand, need to create `SparseTensorImpl` and know nothing about -# how `VariableType`s work. So they need to be dispatched using Tensor types. -# We thus put `requires_tensor=True` to ensure that `VariableType` will unwrap -# the given variables and call with the Tensor type. -# # # Sparse Methods API Design # ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -3353,39 +4100,42 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. -- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full + +- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () + use_c10_dispatcher: full -- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: - SparseCPU: new_with_dims_sparse - SparseCUDA: new_with_dims_sparse - requires_tensor: True + SparseCPU, SparseCUDA: new_with_dims_sparse -- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: - SparseCPU: new_with_dims_and_tensor_sparse - SparseCUDA: new_with_dims_and_tensor_sparse - requires_tensor: True + SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_resize_ - SparseCUDA: sparse_resize_ - requires_tensor: True + SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_resize_and_clear_ - SparseCUDA: sparse_resize_and_clear_ - requires_tensor: True + SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full @@ -3393,16 +4143,13 @@ dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - requires_tensor: True - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_to_dense - SparseCUDA: sparse_to_dense + SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - requires_tensor: True - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor use_c10_dispatcher: full @@ -3411,9 +4158,7 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_dim_sparse - SparseCUDA: sparse_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False # legacy method @@ -3421,18 +4166,14 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_dim_sparse - SparseCUDA: sparse_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method dispatch: - SparseCPU: dense_dim_sparse - SparseCUDA: dense_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False # legacy method @@ -3440,18 +4181,14 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: dense_dim_sparse - SparseCUDA: dense_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _nnz_sparse - SparseCUDA: _nnz_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor @@ -3460,83 +4197,68 @@ dispatch: SparseCPU: coalesce_sparse_cpu 
SparseCUDA: coalesce_sparse_cuda - requires_tensor: True - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method dispatch: - SparseCPU: is_coalesced_sparse - SparseCUDA: is_coalesced_sparse - requires_tensor: True + SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _indices_sparse - SparseCUDA: _indices_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _values_sparse - SparseCUDA: _values_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _values_sparse device_guard: False # This method doesn't do any check but only directly sets the flag. So it can be # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _coalesced_sparse_ - SparseCUDA: _coalesced_sparse_ - requires_tensor: True + SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: indices_sparse - SparseCUDA: indices_sparse - requires_tensor: True + SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: values_sparse - SparseCUDA: values_sparse - requires_tensor: True + SparseCPU, SparseCUDA: values_sparse device_guard: False - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: SparseCPU: hspmm_out_sparse_cpu SparseCUDA: hspmm_out_sparse_cuda - requires_tensor: True - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - requires_tensor: True - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + use_c10_dispatcher: full variants: function dispatch: - SparseCPU: copy_sparse_ - SparseCUDA: copy_sparse_ - requires_tensor: True + SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full @@ -3549,15 +4271,13 @@ use_c10_dispatcher: full variants: method dispatch: - CPU: dense_to_sparse - CUDA: dense_to_sparse + CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor use_c10_dispatcher: full variants: method dispatch: - CPU: dense_to_sparse - CUDA: dense_to_sparse + CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3572,21 +4292,30 @@ dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight +- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv3d_weight + - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - CPU: quantize_per_tensor - CUDA: quantize_per_tensor + CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + 
use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu @@ -3595,53 +4324,50 @@ use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: dequantize_quant - QuantizedCUDA: dequantize_quant + QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] use_c10_dispatcher: full variants: function dispatch: - QuantizedCPU: dequantize_tensors_quant + QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_scale_quant - QuantizedCUDA: q_scale_quant + QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_zero_point_quant - QuantizedCUDA: q_zero_point_quant + QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_scales_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_zero_points_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_axis_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: int_repr_quant_cpu - QuantizedCUDA: int_repr_quant_cuda + QuantizedCPU: int_repr_quantized_cpu + QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor use_c10_dispatcher: full @@ -3658,45 +4384,80 @@ use_c10_dispatcher: full variants: method dispatch: - 
QuantizedCPU: qscheme_quant - QuantizedCUDA: qscheme_quant + QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function +- func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine + +- func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full + variants: function + - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function -- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) +- func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine -# to(Device) must not exist because all constructors of Device also works for -# TensorOptions. Otherwise, an ambiguity error is thrown. -# See NOTE [ TensorOptions Constructors ]. 
-- func: to.dtype_layout(Tensor self, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - variants: method - device_guard: False +- func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full + variants: function -- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - variants: method - device_guard: False +- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) + use_c10_dispatcher: full + variants: function + +- func: _saturate_weight_to_fp16(Tensor weight) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (float, float) + use_c10_dispatcher: full + variants: function + +# to(Device) must not exist because all constructors of Device also works for +# TensorOptions. Otherwise, an ambiguity error is thrown. +# See NOTE [ TensorOptions Constructors ]. +- func: to.dtype_layout(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full + variants: method + device_guard: False + +- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full + variants: method + device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False @@ -3716,20 +4477,26 @@ variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool + use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 @@ -3742,16 +4509,20 @@ # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_lstm_cell_cuda - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_lstm_cell_backward_cuda - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? 
hidden_bias=None) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_cuda @@ -3761,6 +4532,7 @@ CUDA: _thnn_fused_gru_cell_backward_cuda - func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3788,19 +4560,25 @@ use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full - func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full - func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full # Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp` # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? 
dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # Quantized GRU layers @@ -3839,8 +4617,7 @@ variants: method device_guard: False dispatch: - CPU: set_ - CUDA: set_ + CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method @@ -3848,17 +4625,17 @@ dispatch: CPU: set_storage_cpu_ CUDA: set_storage_cuda_ - QuantizedCPU: set_storage_quantized_ - QuantizedCUDA: set_storage_quantized_ + QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False dispatch: - CPU: set_tensor_ - CUDA: set_tensor_ + CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ @@ -3867,18 +4644,17 @@ - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) variants: method dispatch: - QuantizedCPU: set_quantizer_ - QuantizedCUDA: set_quantizer_ + QuantizedCPU, QuantizedCUDA: set_quantizer_ - func: is_set_to(Tensor self, Tensor tensor) -> bool use_c10_dispatcher: full variants: method device_guard: False dispatch: - CPU: is_set_to - CUDA: is_set_to + CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu @@ -3889,6 +4665,7 @@ variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu @@ -3899,6 +4676,7 @@ variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu @@ -3913,19 +4691,18 @@ variants: method device_guard: False dispatch: - CPU: view - CUDA: view + CPU, CUDA, QuantizedCPU, QuantizedCUDA: view MkldnnCPU: mkldnn_view - QuantizedCPU: view - QuantizedCUDA: view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ @@ -3939,6 +4716,7 @@ variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ @@ -3949,10 +4727,10 @@ variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: index_fill_ - CUDA: index_fill_ + CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor use_c10_dispatcher: full @@ -3971,20 +4749,20 @@ variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_ - CUDA: scatter_ + CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_fill_ - CUDA: scatter_fill_ + CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full @@ -3996,11 +4774,23 @@ - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor variants: function, method +- func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + dispatch: + CPU, CUDA: scatter_reduce_ + +- func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + dispatch: + CPU, CUDA: scatter_scalar_reduce_ + - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_add_ - CUDA: scatter_add_ + CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full @@ -4009,53 +4799,23 @@ - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_and_out - CUDA: bitwise_and_out + CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_and_out - CUDA: bitwise_and_out + CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4066,9 +4826,11 @@ variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4080,22 +4842,22 @@ variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_or_out - CUDA: bitwise_or_out + CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_or_out - CUDA: bitwise_or_out + CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4106,9 +4868,11 @@ variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4120,22 +4884,22 @@ variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_xor_out - CUDA: bitwise_xor_out + CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_xor_out - CUDA: bitwise_xor_out + CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4146,9 +4910,11 @@ variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4160,148 +4926,162 @@ variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __lshift__ - CUDA: __lshift__ + CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __lshift__ - CUDA: __lshift__ + CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __ilshift__ - CUDA: __ilshift__ + CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __ilshift__ - CUDA: __ilshift__ + CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __rshift__ - CUDA: __rshift__ + CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __rshift__ - CUDA: __rshift__ + CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __irshift__ - CUDA: __irshift__ + CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __irshift__ - CUDA: __irshift__ + CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) 
self, int diagonal=0) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: pow_ - CUDA: pow_ + CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: pow_ - CUDA: pow_ + CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: fmod_ CUDA: fmod_cuda_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: fmod_ CUDA: fmod_cuda_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: remainder_ - CUDA: remainder_ + CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: remainder_ - CUDA: remainder_ + CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) 
self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: legacy::cpu::_th_addbmm_ + CPU: addbmm_cpu_ CUDA: addbmm__cuda - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -4317,31 +5097,48 @@ CUDA: addbmm_cuda - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -4354,11 +5151,20 @@ use_c10_dispatcher: full variants: method, function +- func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4379,11 +5185,13 @@ variants: method, function - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda @@ -4395,174 +5203,300 @@ CPU: legacy::cpu::_th_trace CUDA: trace_cuda +- func: trace_backward(Tensor grad, int[] sizes) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ne_out - CUDA: ne_out + CPU, CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ne - CUDA: ne + CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: ne_out - CUDA: ne_out + CPU, CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ne - CUDA: ne + CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu +- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# not_equal, alias for torch.ne +- func: not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: eq_out - CUDA: eq_out + CPU, CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: eq - CUDA: eq + CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: eq_out - CUDA: eq_out + CPU, CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: eq - CUDA: eq + CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: ge_out - CUDA: ge_out + CPU, CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ge - CUDA: ge + CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ge_out - CUDA: ge_out + CPU, CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ge - CUDA: ge + CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu +- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# greater_equal, alias for torch.ge +- func: greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: le_out - CUDA: le_out + CPU, CUDA: le_out QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: le - CUDA: le + CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: le_out - CUDA: le_out + CPU, CUDA: le_out QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: le - CUDA: le + CPU, CUDA: le QuantizedCPU: le_quantized_cpu +- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# less_equal, alias for torch.le +- func: less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: gt_out - CUDA: gt_out + CPU, CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: gt - CUDA: gt + CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: gt_out - CUDA: gt_out + CPU, CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: gt - CUDA: gt + CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu +- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# greater, alias for torch.gt +- func: greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: lt_out - CUDA: lt_out + CPU, CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: lt - CUDA: lt + CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: lt_out - CUDA: lt_out + CPU, CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: lt - CUDA: lt + CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu +- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ use_c10_dispatcher: full + variants: method + +- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# less, alias for torch.lt +- func: less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: legacy::cpu::_th_take_out @@ -4575,17 +5509,22 @@ CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take -- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) +- func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + +- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: index_select_out_cpu_ - CUDA: legacy::cuda::_th_index_select_out + CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ - CUDA: legacy::cuda::_th_index_select + CUDA: index_select_cuda SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4594,6 +5533,11 @@ - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor variants: method, function +- func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: masked_select_out_cpu @@ -4606,17 +5550,22 @@ CPU: masked_select_cpu CUDA: masked_select_cuda +- func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: legacy::cpu::_th_nonzero_out - CUDA: legacy::cuda::_th_nonzero_out + CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero - CUDA: legacy::cuda::_th_nonzero + CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] use_c10_dispatcher: full @@ -4631,8 +5580,12 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: gather - CUDA: gather + CPU, CUDA: gather + +- func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) 
@@ -4643,15 +5596,20 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full variants: method, function - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -4839,14 +5797,12 @@ # TODO: remove dispatch section when porting TH CUDA to ATen - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: multinomial_out - CUDA: multinomial_out + CPU, CUDA: multinomial_out - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor variants: method, function dispatch: - CPU: multinomial - CUDA: multinomial + CPU, CUDA: multinomial - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -4870,16 +5826,21 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: lgamma - CUDA: lgamma + CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -4889,10 +5850,10 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: erfinv - CUDA: erfinv + CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: _erfinv__cpu @@ -4903,27 +5864,52 @@ CPU: _erfinv_out_cpu CUDA: _erfinv_out_cuda +- func: i0(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: i0_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out + - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sign_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sign_out - CUDA: sign_out + CPU, CUDA: sign_out + +- func: signbit(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: signbit_out + CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4985,57 +5971,101 @@ CPU: fmod CUDA: fmod_cuda +- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: hypot_out + +- func: hypot(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: hypot + +- func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out + +- func: nextafter(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: nextafter + +- func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + - func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: remainder_out - CUDA: remainder_out + CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: remainder - CUDA: remainder + CPU, CUDA: remainder - func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: remainder_out - CUDA: remainder_out + CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: remainder - CUDA: remainder + CPU, CUDA: remainder -- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+- func: min(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: min + QuantizedCPU: min_quantized_cpu -- func: min.other(Tensor self, Tensor other) -> Tensor +- func: max(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: max + QuantizedCPU: max_quantized_cpu -- func: min(Tensor self) -> Tensor +- func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: min - CUDA: min - QuantizedCPU: min_quant + CPU, CUDA: maximum -- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +- func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out +# binary max, alias of maximum +# NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function -- func: max(Tensor self) -> Tensor +- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: max - CUDA: max - QuantizedCPU: max_quant + CPU, CUDA: minimum + +- func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out + +# binary min, alias for minimum +# NOTE: min is not an alias for minimum, since there is also unary min +- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: min.other(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function - func: median(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5044,18 +6074,42 @@ CPU: median_cpu CUDA: median_cuda +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) dispatch: - CPU: legacy::cpu::_th_sort_out + CPU: sort_out_cpu CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_sort + CPU: sort_cpu CUDA: legacy::cuda::_th_sort - QuantizedCPU: sort_quant + QuantizedCPU: sort_quantized_cpu - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -5069,7 +6123,7 @@ - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor variants: method, function -- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) ->(Tensor(a!) values, Tensor(b!) 
indices) +- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) dispatch: CPU: topk_out_cpu CUDA: legacy::cuda::_th_topk_out @@ -5078,22 +6132,21 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: topk - CUDA: topk - QuantizedCPU: quantized_topk_cpu + CPU, CUDA: topk + QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: any - CUDA: any - SparseCPU: any_sparse - SparseCUDA: any_sparse + CPU, CUDA: any + SparseCPU, SparseCUDA: any_sparse - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5112,62 +6165,82 @@ variants: method device_guard: False dispatch: - CPU: unfold - CUDA: unfold - QuantizedCPU: unfold - QuantizedCUDA: unfold + CPU, CUDA: unfold + QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - CPU: unfold_backward - CUDA: unfold_backward + CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_equal - CUDA: legacy::cuda::_th_equal - QuantizedCPU: quantized_equal_cpu + CPU: cpu_equal + CUDA: cuda_equal + QuantizedCPU: equal_quantized_cpu - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: pow_out - CUDA: pow_out + CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: pow - CUDA: pow + CPU, CUDA: pow - func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: pow_out - CUDA: pow_out + CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full dispatch: - CPU: pow - CUDA: pow + CPU, CUDA: pow + +- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: pow_out + SparseCPU, SparseCUDA: pow_out_sparse_scalar + +- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: pow + SparseCPU, SparseCUDA: pow_sparse_scalar - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor @@ -5177,23 +6250,8 @@ use_c10_dispatcher: full variants: method, function -- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr - CUDA: addr_cuda - -- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_ - CUDA: addr__cuda - -- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_out - CUDA: addr_out_cuda - - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ @@ -5215,33 +6273,301 @@ CPU: _cumprod_cpu CUDA: _cumprod_cuda -- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) +- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _cumprod_out_cpu + CUDA: _cumprod_out_cuda + +- func: _var(Tensor self, bool unbiased=True) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: legacy::cpu::_th_var + +- func: _std(Tensor self, bool unbiased=True) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: legacy::cpu::_th_std + +- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) found_inf, Tensor inv_scale) -> () + use_c10_dispatcher: full + variants: function + dispatch: + CUDA: _amp_non_finite_check_and_unscale_cuda_ + +- func: _amp_update_scale(Tensor(a!) 
growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CUDA: _amp_update_scale_cuda + +- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: _cat_cpu + CUDA: cat_cuda + QuantizedCPU: cat_quantized_cpu + +- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _cat_out_cpu + CUDA: cat_out_cuda + QuantizedCPU: cat_out_quantized_cpu + +- func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow + CUDA: foreach_tensor_add_scalar_kernel_cuda + +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow_ + CUDA: foreach_tensor_add_scalar_kernel_cuda_ + +- func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow + CUDA: foreach_tensor_sub_scalar_kernel_cuda + +- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow_ + CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + +- func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalar_kernel_slow + CUDA: foreach_tensor_mul_scalar_kernel_cuda + +- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: 
foreach_tensor_mul_scalar_kernel_slow_ + CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + +- func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow + CUDA: foreach_tensor_div_scalar_kernel_cuda + +- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow_ + CUDA: foreach_tensor_div_scalar_kernel_cuda_ + +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow + CUDA: foreach_tensor_add_list_kernel_cuda + +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow_ + CUDA: foreach_tensor_add_list_kernel_cuda_ + +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow + CUDA: foreach_tensor_sub_list_kernel_cuda + +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow_ + CUDA: foreach_tensor_sub_list_kernel_cuda_ + +- func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow + CUDA: foreach_tensor_mul_list_kernel_cuda + +- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full + device_guard: 
False + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow_ + CUDA: foreach_tensor_mul_list_kernel_cuda_ + +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow + CUDA: foreach_tensor_div_list_kernel_cuda + +- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow_ + CUDA: foreach_tensor_div_list_kernel_cuda_ + +- func: _foreach_add_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_scalar_list_(Tensor(a!)[] self, float[] 
scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + +- func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_exp_slow + CUDA: foreach_tensor_exp_cuda + +- func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_exp_slow_ + CUDA: foreach_tensor_exp_cuda_ + +- func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sqrt_slow + CUDA: foreach_tensor_sqrt_cuda + +- func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cumprod_out_cpu - CUDA: _cumprod_out_cuda + CPU: foreach_tensor_sqrt_slow_ + CUDA: foreach_tensor_sqrt_cuda_ -- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) 
found_inf, Tensor inv_scale) -> () +- func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_non_finite_check_and_unscale_cuda_ + CPU: foreach_tensor_addcdiv_slow_ + CUDA: foreach_tensor_addcdiv_cuda_ -- func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor +- func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_update_scale_cuda + CPU: foreach_tensor_addcmul_slow_ + CUDA: foreach_tensor_addcmul_cuda_ -- func: _cat(Tensor[] tensors, int dim=0) -> Tensor +- func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cat_cpu - CUDA: cat_cuda - QuantizedCPU: quantized_cat + CPU: foreach_tensor_addcdiv_slow + CUDA: foreach_tensor_addcdiv_cuda -- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) +- func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cat_out_cpu - CUDA: cat_out_cuda - QuantizedCPU: quantized_cat_out + CPU: foreach_tensor_addcmul_slow + CUDA: foreach_tensor_addcmul_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -5292,23 +6618,25 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: mse_loss_backward_out - CUDA: mse_loss_backward_out + CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: mse_loss_backward - CUDA: mse_loss_backward + CPU, CUDA: mse_loss_backward - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5320,8 +6648,7 @@ - func: l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: l1_loss_backward_out - CUDA: l1_loss_backward_out + CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full @@ -5334,6 +6661,7 @@ CUDA: legacy::cuda::_thnn_multi_margin_loss_forward_out - func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: multi_margin_loss_cpu @@ -5346,6 +6674,7 @@ CUDA: legacy::cuda::_thnn_multi_margin_loss_backward_out - func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward @@ -5388,6 +6717,7 @@ python_module: nn - func: nll_loss(Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor + use_c10_dispatcher: full python_module: nn - func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) @@ -5397,6 +6727,7 @@ CUDA: legacy::cuda::_thnn_nll_loss_forward_out - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss_forward_cpu @@ -5409,6 +6740,7 @@ CUDA: legacy::cuda::_thnn_nll_loss_backward_out - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss_backward_cpu @@ -5418,6 +6750,7 @@ python_module: nn - func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor + use_c10_dispatcher: full python_module: nn - func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) @@ -5427,6 +6760,7 @@ CUDA: legacy::cuda::_thnn_nll_loss2d_forward_out - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss2d_forward_cpu @@ -5439,28 +6773,31 @@ CUDA: legacy::cuda::_thnn_nll_loss2d_backward_out - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss2d_backward_cpu CUDA: legacy::cuda::_thnn_nll_loss2d_backward -- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_out CUDA: smooth_l1_loss_out -- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss -- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out -- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor use_c10_dispatcher: full python_module: nn @@ -5480,22 +6817,28 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn dispatch: - CPU: elu_backward_out - CUDA: elu_backward_out + CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) @@ -5526,100 +6869,107 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardsigmoid - CUDA: hardsigmoid - QuantizedCPU: quantized_hardsigmoid + CPU, CUDA: hardsigmoid + QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardsigmoid_backward - CUDA: hardsigmoid_backward + CPU, CUDA: hardsigmoid_backward - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: - CPU: hardtanh_out - CUDA: hardtanh_out - QuantizedCPU: quantized_hardtanh_out + CPU, CUDA: hardtanh_out + QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardtanh - CUDA: hardtanh - QuantizedCPU: quantized_hardtanh + CPU, CUDA: hardtanh + QuantizedCPU: hardtanh_quantized_cpu - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn dispatch: - CPU: hardtanh_backward_out - CUDA: hardtanh_backward_out + CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardtanh_ - CUDA: hardtanh_ - QuantizedCPU: quantized_hardtanh_ - Vulkan: vulkan_hardtanh_ + CPU, CUDA: hardtanh_ + QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardswish_backward - CUDA: hardswish_backward + CPU, CUDA: hardswish_backward - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: - CPU: leaky_relu_out - CUDA: leaky_relu_out - QuantizedCPU: quantized_leaky_relu_out + CPU, CUDA: leaky_relu_out + QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: leaky_relu - CUDA: leaky_relu - QuantizedCPU: quantized_leaky_relu + CPU, CUDA: leaky_relu + QuantizedCPU: heaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) 
self, Scalar negative_slope=0.01) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn dispatch: - CPU: leaky_relu_ - CUDA: leaky_relu_ - QuantizedCPU: quantized_leaky_relu_ + CPU, CUDA: leaky_relu_ + QuantizedCPU: leaky_relu_quantized_cpu_ - func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5678,43 +7028,52 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: softplus_backward_out - CUDA: softplus_backward_out + CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: softshrink_backward_out - CUDA: softshrink_backward_out + CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn dispatch: - CPU: adaptive_avg_pool2d_out_cpu - CUDA: adaptive_avg_pool2d_out_cuda + CPU, CUDA: adaptive_avg_pool2d_out_cpu MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor @@ -5725,14 +7084,13 @@ use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - requires_tensor: True - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda - QuantizedCPU: quantized_adaptive_avg_pool2d + QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5746,6 +7104,7 @@ dispatch: CPU: adaptive_avg_pool3d_out_cpu CUDA: adaptive_avg_pool3d_out_cuda + QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor use_c10_dispatcher: full @@ -5753,6 +7112,7 @@ dispatch: CPU: adaptive_avg_pool3d_cpu CUDA: adaptive_avg_pool3d_cuda + QuantizedCPU: adaptive_avg_pool3d_quantized_cpu - func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -5837,7 +7197,7 @@ CPU: avg_pool2d_cpu CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d - QuantizedCPU: quantized_avg_pool2d + QuantizedCPU: avg_pool2d_quantized_cpu - func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -5857,6 +7217,7 @@ dispatch: CPU: avg_pool3d_out_cpu CUDA: avg_pool3d_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor use_c10_dispatcher: full @@ -5864,7 +7225,8 @@ dispatch: CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda - QuantizedCPU: quantized_avg_pool3d + MkldnnCPU: mkldnn_avg_pool3d + QuantizedCPU: avg_pool3d_quantized_cpu - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6174,6 +7536,108 @@ CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda +- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_linear1d_cpu + CUDA: upsample_linear1d_cuda + +- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_linear1d_backward_cpu + CUDA: upsample_linear1d_backward_cuda + +- func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bilinear2d_cpu + CUDA: upsample_bilinear2d_cuda + QuantizedCPU: upsample_bilinear2d_quantized_cpu + +- func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bilinear2d_backward_cpu + CUDA: upsample_bilinear2d_backward_cuda + +- func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_trilinear3d_cpu + CUDA: upsample_trilinear3d_cuda + +- func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_trilinear3d_backward_cpu + CUDA: upsample_trilinear3d_backward_cuda + +- func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bicubic2d_cpu + CUDA: upsample_bicubic2d_cuda + +- func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bicubic2d_backward_cpu + CUDA: upsample_bicubic2d_backward_cuda + +- func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest1d_cpu + CUDA: upsample_nearest1d_cuda + +- func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest1d_backward_cpu + CUDA: upsample_nearest1d_backward_cuda + +- func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest2d_cpu + CUDA: upsample_nearest2d_cuda + QuantizedCPU: upsample_nearest2d_quantized_cpu + +- func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest2d_backward_cpu + CUDA: upsample_nearest2d_backward_cuda + +- func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest3d_cpu + CUDA: upsample_nearest3d_cuda + QuantizedCPU: upsample_nearest3d_quantized_cpu + +- func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest3d_backward_cpu + CUDA: upsample_nearest3d_backward_cuda + +# NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility. - func: upsample_linear1d.out(Tensor self, int[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: @@ -6212,7 +7676,7 @@ dispatch: CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda - QuantizedCPU: quantized_upsample_bilinear2d_cpu + QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6317,8 +7781,7 @@ dispatch: CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda - QuantizedCPU: quantized_upsample_nearest2d_cpu - Vulkan: upsample_nearest2d_vulkan + QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn @@ -6345,7 +7808,7 @@ dispatch: CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda - QuantizedCPU: quantized_upsample_nearest3d_cpu + QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6363,22 +7826,35 @@ - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: sigmoid_backward_out - CUDA: sigmoid_backward_out + CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward + +- func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: logit_backward_out + +- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: tanh_backward_out - CUDA: tanh_backward_out + CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? # @@ -6405,6 +7881,7 @@ CUDA: slow_conv_transpose2d_out_cuda - func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_cpu @@ -6430,6 +7907,7 @@ CUDA: slow_conv_transpose3d_out_cuda - func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu @@ -6452,6 +7930,7 @@ python_module: nn - func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor + use_c10_dispatcher: full python_module: nn - func: thnn_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!)) @@ -6461,6 +7940,7 @@ CUDA: legacy::cuda::_thnn_conv2d_forward_out - func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_forward_cpu @@ -6483,6 +7963,7 @@ python_module: nn - func: thnn_conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn - func: thnn_conv_depthwise2d_forward.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) @@ -6491,6 +7972,7 @@ CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward_out - func: thnn_conv_depthwise2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward @@ -6510,6 +7992,7 @@ python_module: nn - func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor + use_c10_dispatcher: full python_module: nn - func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!)) @@ -6518,6 +8001,7 @@ CPU: slow_conv3d_forward_out_cpu - func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv3d_forward_cpu @@ -6534,6 +8018,7 @@ CPU: slow_conv3d_backward_cpu - func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu @@ -6547,6 +8032,7 @@ CUDA: slow_conv_dilated2d_backward_cuda - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu @@ -6621,7 +8107,170 @@ variants: function, method device_guard: False -# Note: this function is only for testing. +- func: isposinf(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: isposinf_out + +- func: isneginf(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: isneginf_out + +# NOTE [_add_batch_dim and _remove_batch_dim] +# _add_batch_dim and _remove_batch_dim are meant to be used in the implementation +# of the vmap frontend API (see torch/_vmap_internals.py). They are not +# user-facing, hence the leading underscore. Please don't use them them anywhere else. +- func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor + use_c10_dispatcher: full + variants: function + +# See NOTE [_add_batch_dim and _remove_batch_dim] +- func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor + use_c10_dispatcher: full + variants: function + +## Functions related to the fast Fourier transform and the torch.fft namespace +# Note [FFT namespace binding] +# Functions in the fft python module should have their names start with +# "fft_" underscore and be bound to the desired Python name in +# torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h. +# The "fft_" names should be hidden from the user and not documented. +# +# See fft_fft as an example. + +# torch.fft.fft +# NOTE: NOT an alias for torch.fft, which has different semantics +- func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? 
norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor + use_c10_dispatcher: full + variants: function, method + +## Functions for linear algebra and the torch.linalg namespace +# Note [linalg namespace binding] +# Functions in the linalg python module should have their names start with +# "linalg_" and be bound to the desired Python name in +# torch/linalg/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/linalg.h. +# The "linalg_" names should be hidden from the user and not documented. +# +# See linalg_det as an example. 
+ +# torch.linalg.det, alias for torch.det +- func: linalg_det(Tensor self) -> Tensor + python_module: linalg + use_c10_dispatcher: full + variants: function + +- func: det(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +# torch.outer, alias for torch.ger +- func: outer(Tensor self, Tensor vec2) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: ger(Tensor self, Tensor vec2) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor use_c10_dispatcher: full + +# Note: this function is only for testing. +- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_intlist + +# Note: this function is only for testing. +- func: _test_optional_filled_intlist(Tensor values, int[2]? 
addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_intlist + +# Note: this function is only for testing. +- func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_floatlist diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 08deed30..46183dd3 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -372,7 +372,7 @@ void Init_ext() .define_method( "grad=", *[](Tensor& self, torch::Tensor& grad) { - self.grad() = grad; + self.mutable_grad() = grad; }) .define_method( "_dtype", @@ -609,7 +609,7 @@ void Init_ext() .define_method( "grad=", *[](Parameter& self, torch::Tensor& grad) { - self.grad() = grad; + self.mutable_grad() = grad; }); Class rb_cDevice = define_class_under(rb_mTorch, "Device") diff --git a/ext/torch/ruby_arg_parser.h b/ext/torch/ruby_arg_parser.h index b86825ff..aa9f2df4 100644 --- a/ext/torch/ruby_arg_parser.h +++ b/ext/torch/ruby_arg_parser.h @@ -91,7 +91,7 @@ struct RubyArgs { inline c10::optional toInt64Optional(int i); inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); - // inline c10::OptionalArray doublelistOptional(int i); + inline c10::OptionalArray doublelistOptional(int i); // inline at::Layout layout(int i); // inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -105,7 +105,7 @@ struct RubyArgs { inline c10::optional memoryformatOptional(int i); // inline at::QScheme toQScheme(int i); inline std::string string(int i); - // inline c10::optional stringOptional(int i); + inline c10::optional stringOptional(int i); // inline PyObject* pyobject(int i); inline int64_t toInt64(int i); // inline int64_t toInt64WithDefault(int i, int64_t default_int); @@ -249,6 +249,25 @@ inline c10::optional RubyArgs::toDoubleOptional(int i) { return toDouble(i); } +inline c10::OptionalArray 
RubyArgs::doublelistOptional(int i) { + if (NIL_P(args[i])) return {}; + + VALUE arg = args[i]; + auto size = RARRAY_LEN(arg); + std::vector res(size); + for (idx = 0; idx < size; idx++) { + VALUE obj = rb_ary_entry(arg, idx); + if (FIXNUM_P(obj) || RB_FLOAT_TYPE_P(obj)) { + res[idx] = from_ruby(obj); + } else { + rb_raise(rb_eArgError, "%s(): argument '%s' must be %s, but found element of type %s at pos %d", + signature.name.c_str(), signature.params[i].name.c_str(), + signature.params[i].type_name().c_str(), rb_obj_classname(obj), idx + 1); + } + } + return res; +} + inline c10::optional RubyArgs::layoutOptional(int i) { if (NIL_P(args[i])) return c10::nullopt; @@ -285,6 +304,11 @@ inline std::string RubyArgs::string(int i) { return from_ruby(args[i]); } +inline c10::optional RubyArgs::stringOptional(int i) { + if (!args[i]) return c10::nullopt; + return from_ruby(args[i]); +} + inline int64_t RubyArgs::toInt64(int i) { if (NIL_P(args[i])) return signature.params[i].default_int; return from_ruby(args[i]); diff --git a/ext/torch/templates.h b/ext/torch/templates.h index 40a68c6b..176302c9 100644 --- a/ext/torch/templates.h +++ b/ext/torch/templates.h @@ -19,6 +19,7 @@ using torch::TensorOptions; using torch::Layout; using torch::MemoryFormat; using torch::IntArrayRef; +using torch::ArrayRef; using torch::TensorList; using torch::Storage; diff --git a/ext/torch/wrap_outputs.h b/ext/torch/wrap_outputs.h index 914b2688..97ab209e 100644 --- a/ext/torch/wrap_outputs.h +++ b/ext/torch/wrap_outputs.h @@ -90,3 +90,10 @@ inline Object wrap(torch::TensorList x) { } return Object(a); } + +inline Object wrap(std::tuple x) { + Array a; + a.push(to_ruby(std::get<0>(x))); + a.push(to_ruby(std::get<1>(x))); + return Object(a); +} From 30d02fe12c5833340e862e446b5ae74218984e27 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 16:25:29 -0700 Subject: [PATCH 02/28] Added test for max change in 1.7.0 [skip ci] --- test/autograd_test.rb | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/test/autograd_test.rb b/test/autograd_test.rb index 79d64e39..1ad40314 100644 --- a/test/autograd_test.rb +++ b/test/autograd_test.rb @@ -80,4 +80,12 @@ def test_variable_invalid end assert_equal "Variable data has to be a tensor, but got Object", error.message end + + # 1.7.0 behavior + def test_max + a = Torch.tensor([3.0, 2, 3], requires_grad: true) + a.max.backward + # TODO debug + # assert_equal [0.5, 0, 0.5], a.grad.to_a + end end From d348c9302fbfe2641cdd9828c27c200be24bd902 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:31:04 -0700 Subject: [PATCH 03/28] Updated changelog [skip ci] --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5524cdcb..8e1b7273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.5.0 (unreleased) + +- Updated LibTorch to 1.7.0 + ## 0.4.2 (2020-10-27) - Fixed errors with optimizer options From 47841b1bf92fd24e25def1a2bf845f9c9e451cae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:43:12 -0700 Subject: [PATCH 04/28] Removed deprecated overload for addcmul! and addcdiv! 
--- CHANGELOG.md | 1 + ext/torch/ext.cpp | 10 ---------- lib/torch/optim/adadelta.rb | 4 ++-- lib/torch/optim/adagrad.rb | 4 ++-- lib/torch/optim/adam.rb | 4 ++-- lib/torch/optim/adamax.rb | 2 +- lib/torch/optim/adamw.rb | 4 ++-- lib/torch/optim/rmsprop.rb | 6 +++--- lib/torch/optim/rprop.rb | 2 +- 9 files changed, 14 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e1b7273..9d59630e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.5.0 (unreleased) - Updated LibTorch to 1.7.0 +- Removed deprecated overload for `addcmul!` and `addcdiv!` ## 0.4.2 (2020-10-27) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 46183dd3..78097260 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -348,16 +348,6 @@ void Init_ext() *[](Tensor& self) { return self.is_contiguous(); }) - .define_method( - "addcmul!", - *[](Tensor& self, Scalar value, const Tensor & tensor1, const Tensor & tensor2) { - return self.addcmul_(tensor1, tensor2, value); - }) - .define_method( - "addcdiv!", - *[](Tensor& self, Scalar value, const Tensor & tensor1, const Tensor & tensor2) { - return self.addcdiv_(tensor1, tensor2, value); - }) .define_method( "_requires_grad!", *[](Tensor& self, bool requires_grad) { diff --git a/lib/torch/optim/adadelta.rb b/lib/torch/optim/adadelta.rb index 3b496d8a..af06786d 100644 --- a/lib/torch/optim/adadelta.rb +++ b/lib/torch/optim/adadelta.rb @@ -42,11 +42,11 @@ def step(closure = nil) grad = grad.add(p.data, alpha: group[:weight_decay]) end - square_avg.mul!(rho).addcmul!(1 - rho, grad, grad) + square_avg.mul!(rho).addcmul!(grad, grad, value: 1 - rho) std = square_avg.add(eps).sqrt! 
delta = acc_delta.add(eps).sqrt!.div!(std).mul!(grad) p.data.add!(delta, alpha: -group[:lr]) - acc_delta.mul!(rho).addcmul!(1 - rho, delta, delta) + acc_delta.mul!(rho).addcmul!(delta, delta, value: 1 - rho) end end diff --git a/lib/torch/optim/adagrad.rb b/lib/torch/optim/adagrad.rb index d8322c5d..1e17e10f 100644 --- a/lib/torch/optim/adagrad.rb +++ b/lib/torch/optim/adagrad.rb @@ -57,9 +57,9 @@ def step(closure = nil) if grad.sparse? raise NotImplementedYet else - state[:sum].addcmul!(1, grad, grad) + state[:sum].addcmul!(grad, grad, value: 1) std = state[:sum].sqrt.add!(group[:eps]) - p.data.addcdiv!(-clr, grad, std) + p.data.addcdiv!(grad, std, value: -clr) end end end diff --git a/lib/torch/optim/adam.rb b/lib/torch/optim/adam.rb index 3a110ba9..1ce128a3 100644 --- a/lib/torch/optim/adam.rb +++ b/lib/torch/optim/adam.rb @@ -58,7 +58,7 @@ def step(closure = nil) # Decay the first and second moment running average coefficient exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1) - exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad) + exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2) if amsgrad # Maintains the maximum of all 2nd moment running avg. 
till now Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq) @@ -70,7 +70,7 @@ def step(closure = nil) step_size = group[:lr] / bias_correction1 - p.data.addcdiv!(-step_size, exp_avg, denom) + p.data.addcdiv!(exp_avg, denom, value: -step_size) end end diff --git a/lib/torch/optim/adamax.rb b/lib/torch/optim/adamax.rb index 64fe5954..aeb878cf 100644 --- a/lib/torch/optim/adamax.rb +++ b/lib/torch/optim/adamax.rb @@ -57,7 +57,7 @@ def step(closure = nil) bias_correction = 1 - beta1 ** state[:step] clr = group[:lr] / bias_correction - p.data.addcdiv!(-clr, exp_avg, exp_inf) + p.data.addcdiv!(exp_avg, exp_inf, value: -clr) end end diff --git a/lib/torch/optim/adamw.rb b/lib/torch/optim/adamw.rb index b31e8f85..db9608c0 100644 --- a/lib/torch/optim/adamw.rb +++ b/lib/torch/optim/adamw.rb @@ -59,7 +59,7 @@ def step(closure = nil) # Decay the first and second moment running average coefficient exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1) - exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad) + exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2) if amsgrad # Maintains the maximum of all 2nd moment running avg. 
till now Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq) @@ -71,7 +71,7 @@ def step(closure = nil) step_size = group[:lr] / bias_correction1 - p.data.addcdiv!(-step_size, exp_avg, denom) + p.data.addcdiv!(exp_avg, denom, value: -step_size) end end diff --git a/lib/torch/optim/rmsprop.rb b/lib/torch/optim/rmsprop.rb index 367f07b5..c05d7959 100644 --- a/lib/torch/optim/rmsprop.rb +++ b/lib/torch/optim/rmsprop.rb @@ -49,7 +49,7 @@ def step(closure = nil) grad = grad.add(p.data, alpha: group[:weight_decay]) end - square_avg.mul!(alpha).addcmul!(1 - alpha, grad, grad) + square_avg.mul!(alpha).addcmul!(grad, grad, value: 1 - alpha) if group[:centered] grad_avg = state[:grad_avg] @@ -61,10 +61,10 @@ def step(closure = nil) if group[:momentum] > 0 buf = state[:momentum_buffer] - buf.mul!(group[:momentum]).addcdiv!(1, grad, avg) + buf.mul!(group[:momentum]).addcdiv!(grad, avg, value: 1) p.data.add!(buf, alpha: -group[:lr]) else - p.data.addcdiv!(-group[:lr], grad, avg) + p.data.addcdiv!(grad, avg, value: -group[:lr]) end end end diff --git a/lib/torch/optim/rprop.rb b/lib/torch/optim/rprop.rb index 226b4759..cf50d03c 100644 --- a/lib/torch/optim/rprop.rb +++ b/lib/torch/optim/rprop.rb @@ -52,7 +52,7 @@ def step(closure = nil) grad[sign.eq(etaminus)] = 0 # update parameters - p.data.addcmul!(-1, grad.sign, step_size) + p.data.addcmul!(grad.sign, step_size, value: -1) state[:prev].copy!(grad) end From 14b38d77860ca02dea9b6b3d8667aef97e7c208e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:53:16 -0700 Subject: [PATCH 05/28] Updated readme [skip ci] --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5b5a6e43..f389f47f 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,7 @@ Here’s the list of compatible versions. 
Torch.rb | LibTorch --- | --- +0.5.0+ | 1.7.0 0.3.0+ | 1.6.0 0.2.0-0.2.7 | 1.5.0-1.5.1 0.1.8 | 1.4.0 From 80a601d32ab90e5f1294fbddc832b46d8bff101c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 09:54:13 +0000 Subject: [PATCH 06/28] manual_seed and manual_seed_all CUDA methods added --- ext/torch/ext.cpp | 6 +++++- test/cuda_test.rb | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 78097260..cd10498a 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -16,6 +16,8 @@ #include "tensor_functions.h" #include "nn_functions.h" +//#include "cuda_functions.h" + using namespace Rice; using torch::indexing::TensorIndex; @@ -618,5 +620,7 @@ void Init_ext() Module rb_mCUDA = define_module_under(rb_mTorch, "CUDA") .add_handler(handle_error) .define_singleton_method("available?", &torch::cuda::is_available) - .define_singleton_method("device_count", &torch::cuda::device_count); + .define_singleton_method("device_count", &torch::cuda::device_count) + .define_singleton_method("manual_seed", &torch::cuda::manual_seed) + .define_singleton_method("manual_seed_all", &torch::cuda::manual_seed_all); } diff --git a/test/cuda_test.rb b/test/cuda_test.rb index 2b33f615..ed54e20e 100644 --- a/test/cuda_test.rb +++ b/test/cuda_test.rb @@ -26,4 +26,30 @@ def test_tensor assert_equal "PyTorch is not linked with support for cuda devices", error.message end end + + def test_random_seed + if Torch::CUDA.available? 
+ Torch::CUDA.manual_seed_all 42 + + comparables = Torch::CUDA.device_count.times.map do |i| + x, y = 2.times.map { Torch.rand(100, device: "cuda:#{i}").to_a } + assert x != y + [x, y] + end + + Torch::CUDA.manual_seed_all 42 + Torch::CUDA.device_count.times.map do |i| + x, y = 2.times.map { Torch.rand(100, device: "cuda:#{i}").to_a } + assert x != y + assert_equal x, comparables[i].first + assert_equal y, comparables[i].last + end + else + error = assert_raises do + Torch.random 1, device: 'cuda:0' + end + + assert_equal "PyTorch is not linked with support for cuda devices", error.message + end + end end From 0612de7bd87cc6c9ec34746164553a1c0fcb83f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 10:04:29 +0000 Subject: [PATCH 07/28] Unknown parameter in module error message fixed --- lib/torch/nn/module.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 9418bfcb..91c698e9 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -134,7 +134,7 @@ def load_state_dict(state_dict) param.copy!(input_param) end else - raise Error, "Unknown parameter: #{k1}" + raise Error, "Unknown parameter `#{k2}` in module `#{k1}`" end else raise Error, "Unknown module: #{k1}" From 5e117f0cf02698f6b33c9f10805c8b38d4900980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 10:09:49 +0000 Subject: [PATCH 08/28] fixed CUDA random test for non-CUDA environment --- test/cuda_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda_test.rb b/test/cuda_test.rb index ed54e20e..8d3897f2 100644 --- a/test/cuda_test.rb +++ b/test/cuda_test.rb @@ -46,7 +46,7 @@ def test_random_seed end else error = assert_raises do - Torch.random 1, device: 'cuda:0' + Torch.rand 1, device: 'cuda:0' 
end assert_equal "PyTorch is not linked with support for cuda devices", error.message From 987a3003d40a52df434e6d8f6502b0ecc06a6223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 19:39:03 +0000 Subject: [PATCH 09/28] removed useless commented out header --- ext/torch/ext.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index cd10498a..f0a8da2c 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -16,8 +16,6 @@ #include "tensor_functions.h" #include "nn_functions.h" -//#include "cuda_functions.h" - using namespace Rice; using torch::indexing::TensorIndex; From cecf39813c54fb4e51e9e49ec0df9c9bf219fb22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 21 Dec 2020 15:57:59 +0000 Subject: [PATCH 10/28] named buffers load/save --- lib/torch/nn/module.rb | 49 +++++++++++++++++++++++++++++------------- test/nn/module_test.rb | 13 +++++++++++ test/support/net.rb | 23 ++++++++++++++++++++ 3 files changed, 70 insertions(+), 15 deletions(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 91c698e9..9b6a459b 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -113,31 +113,41 @@ def call(*input, **kwargs) forward(*input, **kwargs) end - def state_dict(destination: nil) + def state_dict(destination: nil, prefix: "") destination ||= {} - named_parameters.each do |k, v| - destination[k] = v + save_to_state_dict(destination, prefix: prefix) + + named_children.each do |name, mod| + next unless mod + + mod.state_dict(destination: destination, prefix: prefix + name + '.') end destination end - + # TODO add strict option # TODO match PyTorch behavior def load_state_dict(state_dict) state_dict.each do |k, input_param| - k1, k2 = k.split(".", 2) - mod = named_modules[k1] - if mod.is_a?(Module) - param = 
mod.named_parameters[k2] - if param.is_a?(Parameter) - Torch.no_grad do - param.copy!(input_param) - end - else - raise Error, "Unknown parameter `#{k2}` in module `#{k1}`" + *mods, param_name = k.split(".") + + mods_ok = [] + mod = mods.inject(self) do |mod, name| + child = mod.named_modules[name] + raise Error, "Unknown module `#{[mods_ok + name].join '.'}`" unless child.is_a?(Module) + + mods_ok << name + child + end + + param = mod.named_parameters[param_name] || mod.named_buffers[param_name] + if param.is_a?(Parameter) || param.is_a?(Tensor) + Torch.no_grad do + param.copy!(input_param) end else - raise Error, "Unknown module: #{k1}" + p mod, param.class, mod.named_parameters.keys, mod.named_buffers.keys + raise Error, "Unknown parameter `#{param_name}` in module `#{mods.join '.'}`" end end @@ -300,6 +310,15 @@ def format(str, *vars, **options) def dict instance_variables.reject { |k| instance_variable_get(k).is_a?(Tensor) }.map { |k| [k[1..-1].to_sym, instance_variable_get(k)] }.to_h end + + def save_to_state_dict(destination, prefix: "") + named_parameters(prefix: prefix, recurse: false).each do |k, v| + destination[k] = v + end + named_buffers.each do |k, v| + destination[prefix + k] = v + end + end end end end diff --git a/test/nn/module_test.rb b/test/nn/module_test.rb index 0588a3c4..a7cce31c 100644 --- a/test/nn/module_test.rb +++ b/test/nn/module_test.rb @@ -70,6 +70,19 @@ def test_state_dict # Torch.save(optimizer.state_dict, tmpfile2.path) end + def test_state_dict_with_buffers + net = SimpleResidualBlock.new + expected_keys = %w[seq.0.weight seq.1.weight seq.1.bias seq.1.running_mean seq.1.running_var seq.1.num_batches_tracked seq.3.weight seq.4.weight seq.4.bias seq.4.running_mean seq.4.running_var seq.4.num_batches_tracked seq.6.weight seq.7.weight seq.7.bias seq.7.running_mean seq.7.running_var seq.7.num_batches_tracked] + assert_equal expected_keys, net.state_dict.keys + + tmpfile = Tempfile.new + Torch.save net.state_dict, tmpfile.path + + net 
= SimpleResidualBlock.new + net.load_state_dict Torch.load tmpfile.path + net.eval + end + def test_inspect assert_match "(conv1): Conv2d(1, 6, kernel_size: [3, 3], stride: [1, 1])", net.inspect end diff --git a/test/support/net.rb b/test/support/net.rb index de1fe602..11a2e9a3 100644 --- a/test/support/net.rb +++ b/test/support/net.rb @@ -27,3 +27,26 @@ def num_flat_features(x) num_features end end + +class SimpleResidualBlock < Torch::NN::Module + def initialize + super + + @relu = Torch::NN::ReLU.new + + @seq = Torch::NN::Sequential.new( + Torch::NN::Conv2d.new(64, 128, 3, padding: 1, bias: false), + Torch::NN::BatchNorm2d.new(128), + @relu, + Torch::NN::Conv2d.new(128, 128, 3, padding: 1, bias: false), + Torch::NN::BatchNorm2d.new(128), + @relu, + Torch::NN::Conv2d.new(128, 64, 3, bias: false), + Torch::NN::BatchNorm2d.new(64) + ) + end + + def forward(x) + @relu.forward(@seq.forward(x) + x) + end +end From b20770cd44b36b78009e52783f2501fede94a090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 21 Dec 2020 17:55:04 +0000 Subject: [PATCH 11/28] debug print removed --- lib/torch/nn/module.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 9b6a459b..a6e4e292 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -146,7 +146,6 @@ def load_state_dict(state_dict) param.copy!(input_param) end else - p mod, param.class, mod.named_parameters.keys, mod.named_buffers.keys raise Error, "Unknown parameter `#{param_name}` in module `#{mods.join '.'}`" end end From ef05f1099ea01d9884a09ef9d934ae0ec8d6345e Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 08:21:51 +0000 Subject: [PATCH 12/28] multihead attention --- lib/torch.rb | 2 + lib/torch/nn/functional_attention.rb | 242 +++++++++++++++++++++++++++ lib/torch/nn/multihead_attention.rb | 123 ++++++++++++++ test/nn/functional_attention_test.rb | 
141 ++++++++++++++++ 4 files changed, 508 insertions(+) create mode 100644 lib/torch/nn/functional_attention.rb create mode 100644 lib/torch/nn/multihead_attention.rb create mode 100644 test/nn/functional_attention_test.rb diff --git a/lib/torch.rb b/lib/torch.rb index 20996052..620e016a 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -132,6 +132,7 @@ require "torch/nn/softsign" require "torch/nn/tanh" require "torch/nn/tanhshrink" +require "torch/nn/multihead_attention" # nn activations other require "torch/nn/log_softmax" @@ -174,6 +175,7 @@ # nn other require "torch/nn/functional" +require "torch/nn/functional_attention" require "torch/nn/init" # utils diff --git a/lib/torch/nn/functional_attention.rb b/lib/torch/nn/functional_attention.rb new file mode 100644 index 00000000..cfc6d3ac --- /dev/null +++ b/lib/torch/nn/functional_attention.rb @@ -0,0 +1,242 @@ +module Torch + module NN + class Functional + class << self + def in_projection_packed(q, k, v, w, b: nil) + e = q.size(-1) + + if k.eql? v + if q.eql? k + # self-attention + return linear(q, w, b).chunk(3, dim: -1) + else + # encoder-decoder attention + w_q, w_kv = w.split_with_sizes([e, e * 2]) + if b.nil? + b_q = b_kv = nil + else + b_q, b_kv = b.split_with_sizes([e, e * 2]) + end + + return [linear(q, w_q, b_q), *linear(k, w_kv, b_kv).chunk(2, dim: -1)] + end + else + w_q, w_k, w_v = w.chunk(3) + if b.nil? 
+              b_q = b_k = b_v = nil
+            else
+              b_q, b_k, b_v = b.chunk(3)
+            end
+
+            return [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+          end
+        end
+
+        def in_projection(
+          q, k, v,
+          w_q, w_k, w_v,
+          b_q: nil, b_k: nil, b_v: nil
+        )
+
+          e_q, e_k, e_v = q.size(-1), k.size(-1), v.size(-1)
+
+          raise ArgumentError, "Expecting query weights shape of #{[e_q, e_q]}, but got #{w_q.shape}" unless w_q.shape == [e_q, e_q]
+          raise ArgumentError, "Expecting key weights shape of #{[e_k, e_k]}, but got #{w_k.shape}" unless w_k.shape == [e_k, e_k]
+          raise ArgumentError, "Expecting value weights shape of #{[e_v, e_v]}, but got #{w_v.shape}" unless w_v.shape == [e_v, e_v]
+
+          raise ArgumentError, "Expecting query bias shape of #{[e_q]}, but got #{b_q.shape}" if b_q && b_q.shape != [e_q]
+          raise ArgumentError, "Expecting key bias shape of #{[e_k]}, but got #{b_k.shape}" if b_k && b_k.shape != [e_k]
+          raise ArgumentError, "Expecting value bias shape of #{[e_v]}, but got #{b_v.shape}" if b_v && b_v.shape != [e_v]
+
+          [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+        end
+
+        def scaled_dot_product_attention(
+          q, k, v,
+          attn_mask: nil, dropout_p: 0.0
+        )
+
+          b, nt, e = q.shape
+
+          q = q / Math.sqrt(e)
+
+          attn = Torch.bmm(q, k.transpose(-2, -1))
+          attn += attn_mask if attn_mask
+          attn = softmax(attn, dim: -1)
+          attn = dropout(attn, p: dropout_p) if dropout_p > 0
+
+          output = Torch.bmm(attn, v)
+
+          [output, attn]
+        end
+
+        def multi_head_attention_forward(
+          query, key, value,
+          embed_dim_to_check, num_heads,
+          in_proj_weight, in_proj_bias,
+          bias_k, bias_v,
+          add_zero_attn,
+          dropout_p,
+          out_proj_weight, out_proj_bias,
+          training: true,
+          key_padding_mask: nil,
+          need_weights: true,
+          attn_mask: nil,
+          use_separate_proj_weight: false,
+          q_proj_weight: nil, k_proj_weight: nil, v_proj_weight: nil,
+          static_k: nil, static_v: nil
+        )
+
+          tgt_len, bsz, embed_dim = query.shape
+          src_len = key.shape.first
+
+          raise ArgumentError, "Was expecting embedding dimension of
#{embed_dim_to_check}, but got #{embed_dim}" unless embed_dim == embed_dim_to_check + + head_dim = if embed_dim.is_a?(Torch::Tensor) + embed_dim.div(num_heads, rounding_mode: 'trunc') + else + head_dim = embed_dim.div num_heads + end + + if use_separate_proj_weight + raise ArgumentError, "Key's sequence and batch dims #{key.shape[...2]} do not match value's #{value.shape[...2]}" unless key.shape[...2] == value.shape[...2] + else + raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape + end + + + # compute in-projection + q, k, v = + if use_separate_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but q_proj_weight is nil" unless q_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but k_proj_weight is nil" unless k_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but v_proj_weight is nil" unless v_proj_weight + + if in_proj_bias + b_q, b_k, b_v = in_proj_bias.chunk(3) + else + b_q = b_k = b_v = nil + end + + in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q: b_q, b_k: b_k, b_v: b_v) + else + in_projection_packed(query, key, value, in_proj_weight, b: in_proj_bias) + end + + # prep attention mask + if attn_mask + if attn_mask.dtype == :uint8 + puts "[WARN] Byte tensor for attn_mask in Multihead Attention is deprecated. Use bool tensor instead." + attn_mask = attn_mask.bool + else + raise ArgumentError, "Only float, byte, and bool types are supported for attn_mask, not #{attn_mask.dtype}" unless attn_mask.floating_point? or attn_mask.dtype == :bool + end + + if attn_mask.dim == 2 + correct_2d_size = [tgt_len, src_len] + raise ArgumentError, "The shape of the 2D attn_mask is #{attn_mask.shape}, but should be #{correct_2d_size}." 
unless attn_mask.shape == correct_2d_size + + attn_mask = attn_mask.unsqueeze(0) + elsif attn_mask.dim == 3 + correct_3d_size = [bsz * num_heads, tgt_len, src_len] + raise ArgumentError, "The shape of the 3D attn_mask is #{attn_mask.shape}, but should be #{correct_3d_size}." unless attn_mask.shape == correct_3d_size + else + raise ArgumentError, "attn_mask's dimension #{attn_mask.dim} is not supported" + end + end + + # prep key padding mask + if key_padding_mask && key_padding_mask.dtype == :uint8 + puts "[WARN] Byte tensor for key_padding_mask in Multihead Attention is deprecated. Use bool tensor instead." + key_padding_mask = key_padding_mask.bool + end + + # add bias along batch dimension (currently second) + if bias_k && bias_v + raise ArgumentError, "bias cannot be added to static key." if static_k + raise ArgumentError, "bias cannot be added to static value." if static_v + + k = Torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = Torch.cat([v, bias_v.repeat(1, bsz, 1)]) + + attn_mask = pad(attn_mask, [0, 1]) if attn_mask + key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask + else + raise ArgumentError unless bias_k.nil? + raise ArgumentError unless bias_v.nil? + end + + # reshape q, k, v for multihead attention and make em batch first + q = q.contiguous.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + + if static_k.nil? + k = k.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1) + else + raise ArgumentError, "Expecting static_k.size(0) of #{bsz * num_heads}, but got #{static_k.size(0)}" unless static_k.size(0) == bsz * num_heads + raise ArgumentError, "Expecting static_k.size(2) of #{bsz * num_heads}, but got #{static_k.size(2)}" unless static_k.size(2) == bsz * num_heads + + k = static_k + end + + if static_v.nil? 
+ v = v.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1) + else + raise ArgumentError, "Expecting static_v.size(0) of #{bsz * num_heads}, but got #{static_v.size(0)}" unless static_v.size(0) == bsz * num_heads + raise ArgumentError, "Expecting static_v.size(2) of #{bsz * num_heads}, but got #{static_v.size(2)}" unless static_v.size(2) == bsz * num_heads + + v = static_v + end + + # add zero attention along batch dimension (now first) + if add_zero_attn + zero_attn_shape = [bsz * num_heads, 1, head_dim] + k = Torch.cat([k, Torch.zeros(zero_attn_shape, dtype: k.dtype, device: k.device)], dim: 1) + v = Torch.cat([v, Torch.zeros(zero_attn_shape, dtype: v.dtype, device: v.device)], dim: 1) + + attn_mask = pad(attn_mask, [0, 1]) if attn_mask + key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask + end + + # update source sequence length after adjustments + src_len = k.size(1) + + # merge key padding and attention masks + if key_padding_mask + raise ArgumentError, "Expecting key_padding_mask shape of #{[bsz, src_len]}, but got #{key_padding_mask.shape}" unless key_padding_mask.shape == [bsz, src_len] + + key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + + attn_mask = if attn_mask.nil? 
+ key_padding_mask + elsif attn_mask.dtype == :bool + attn_mask.logical_or(key_padding_mask) + else + attn_mask.masked_fill(key_padding_mask, -Float::INFINITY) + end + end + + # convert mask to float + if attn_mask && attn_mask.dtype == :bool + new_attn_mask = Torch.zeros_like(attn_mask, dtype: :float32) + attn_mask = new_attn_mask.masked_fill(attn_mask, -Float::INFINITY) + end + + dropout_p = 0.0 unless training + + # (deep breath) calculate attention and out projection + attn_output, attn_output_weights = scaled_dot_product_attention(q, k, v, attn_mask: attn_mask, dropout_p: dropout_p) + attn_output = attn_output.transpose(0, 1).contiguous.view(tgt_len, bsz, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + [attn_output, attn_output_weights.sum(dim: 1) / num_heads] + else + [attn_output, nil] + end + end + end + end + end +end diff --git a/lib/torch/nn/multihead_attention.rb b/lib/torch/nn/multihead_attention.rb new file mode 100644 index 00000000..566fb3c5 --- /dev/null +++ b/lib/torch/nn/multihead_attention.rb @@ -0,0 +1,123 @@ +module Torch + module NN + class MultiheadAttention < Module + def initialize( + embed_dim, num_heads, + dropout: 0.0, bias: true, add_bias_kv: false, add_zero_attn: false, + kdim: nil, vdim: nil, batch_first: false, device: nil, dtype: nil + ) + + super() + + @embed_dim = embed_dim + @kdim = kdim || @embed_dim + @vdim = vdim || @embed_dim + + @qkv_same_embed_dim = @kdim == @embed_dim && @vdim == @embed_dim + + @num_heads = num_heads + @dropout = dropout + @batch_first = batch_first + + @head_dim = @embed_dim.div @num_heads + + raise ArgumentError, "embed_dim must be divisible by num_heads" unless @head_dim * @num_heads == @embed_dim + + if @qkv_same_embed_dim + @in_proj_weight = Parameter.new(Torch.empty([3 * @embed_dim, @embed_dim])) + %w(q k v).each { |x| 
register_parameter("#{x}_proj_weight", nil) } + else + @q_proj_weight = Parameter.new(Torch.empty([@embed_dim, @embed_dim])) + @k_proj_weight = Parameter.new(Torch.empty([@embed_dim, @kdim])) + @v_proj_weight = Parameter.new(Torch.empty([@embed_dim, @vdim])) + + register_parameter('in_proj_weight', nil) + end + + if bias + @in_proj_bias = Parameter.new(Torch.empty(3 * @embed_dim)) + else + register_parameter('in_proj_bias', nil) + end + + @out_proj = Linear.new(@embed_dim, @embed_dim, bias: bias) + + if add_bias_kv + @bias_k = Parameter.new(Torch.empty([1, 1, @embed_dim])) + @bias_v = Parameter.new(Torch.empty([1, 1, @embed_dim])) + else + @bias_k = @bias_v = nil + end + + @add_zero_attn = add_zero_attn + + reset_parameters + end + + def batch_first? + !!@batch_first + end + + def reset_parameters + if @qkv_same_embed_dim + Init.xavier_uniform!(@in_proj_weight) + else + Init.xavier_uniform!(@q_proj_weight) + Init.xavier_uniform!(@k_proj_weight) + Init.xavier_uniform!(@v_proj_weight) + end + + if @in_proj_bias + Init.constant!(@in_proj_bias, 0.0) + Init.constant!(@out_proj.bias, 0.0) + end + + Init.xavier_uniform!(@bias_k) if @bias_k + Init.xavier_uniform!(@bias_v) if @bias_v + end + + def forward( + query, key, value, + key_padding_mask: nil, need_weights: true, attn_mask: nil + ) + + if batch_first? 
+          query, key, value = [query, key, value].map { |t| t.transpose(1, 0) }
+        end
+
+        attn_output, attn_output_weights =
+          if @qkv_same_embed_dim
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask
+            )
+          else
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask,
+              use_separate_proj_weight: true,
+              q_proj_weight: @q_proj_weight, k_proj_weight: @k_proj_weight, v_proj_weight: @v_proj_weight
+            )
+          end
+
+        attn_output = attn_output.transpose(1, 0) if batch_first?
+
+        [attn_output, attn_output_weights]
+      end
+    end
+  end
+end
diff --git a/test/nn/functional_attention_test.rb b/test/nn/functional_attention_test.rb
new file mode 100644
index 00000000..de18a7cb
--- /dev/null
+++ b/test/nn/functional_attention_test.rb
@@ -0,0 +1,141 @@
+require_relative '../test_helper'
+
+class FunctionalAttentionTest < Minitest::Test
+  T = 4
+  S = 8
+  B = 2
+  E = 6
+
+  SEED = 42
+
+  def test_self_attention_no_mask
+    t = Torch.ones([T, B, E])
+    Torch.manual_seed SEED
+    attn = Torch::NN::MultiheadAttention.new E, 2
+    out, weights = attn.(t, t, t)
+
+    expected_out = Torch.tensor([
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479,
0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500]], + + [[0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - b).abs.lt(1e-6).all + end + end + + def test_self_attention_with_masks + t = Torch.ones([T, B, E]) + Torch.manual_seed SEED + attn = Torch::NN::MultiheadAttention.new E, 2 + + attn_mask = Torch.triu(Torch.ones([T, T]), diagonal: 1).eq(1) + key_padding_mask = Torch.triu(Torch.zeros(B, T)) + key_padding_mask[0, -1] = 1 + + out, weights = attn.(t, t, t, attn_mask: attn_mask, key_padding_mask: key_padding_mask) + + expected_out = Torch.tensor([ + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[1.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000]], + + [[1.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - 
b).abs.lt(1e-6).all + end + end + + def test_encoder_decoder_attention + q = Torch.ones([T, B, E]) + k = v = Torch.ones([S, B, E]) + Torch.manual_seed SEED + attn = Torch::NN::MultiheadAttention.new E, 2 + + attn_mask = Torch.triu(Torch.ones([T, S]), diagonal: 1).eq(1) + key_padding_mask = Torch.triu(Torch.zeros(B, S)) + key_padding_mask[0, -1] = 1 + + out, weights = attn.(q, k, v, attn_mask: attn_mask, key_padding_mask: key_padding_mask) + + expected_out = Torch.tensor([ + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000]], + + [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - b).abs.lt(1e-6).all + end + end +end From d2edbab39079c58884feae8e14d5982d4aa46ea8 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:25:47 +0000 Subject: [PATCH 13/28] removed endless range for respecting dying ruby 2.6 --- lib/torch/nn/functional_attention.rb 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/torch/nn/functional_attention.rb b/lib/torch/nn/functional_attention.rb index cfc6d3ac..8e3785c6 100644 --- a/lib/torch/nn/functional_attention.rb +++ b/lib/torch/nn/functional_attention.rb @@ -99,7 +99,7 @@ def multi_head_attention_forward( end if use_separate_proj_weight - raise ArgumentError, "Key's sequence and batch dims #{key.shape[...2]} do not match value's #{value.shape[...2]}" unless key.shape[...2] == value.shape[...2] + raise ArgumentError, "Key's sequence and batch dims #{key.shape[0...2]} do not match value's #{value.shape[0...2]}" unless key.shape[0...2] == value.shape[0...2] else raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape end From f9e9e86d7290c4636fbf6e9c691ed31ac16de4f2 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:25:58 +0000 Subject: [PATCH 14/28] module list --- lib/torch/nn/module.rb | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 145ec26e..5fa0dade 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -387,5 +387,58 @@ def save_to_state_dict(destination, prefix: "") end end end + + class ModuleList < Module + def initialize(mods = nil) + super() + + return unless mods + self.extend(mods) + end + + def length + @modules.length + end + + alias :count :length + + def extend(mods) + raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each) + + mods.each { |m| append m } + + self + end + + def each(&block) + @modules.values.each &block + end + + def map(&block) + @modules.values.map &block + end + + def inject(inj, &block) + @modules.values.inject(inj, &block) + end + + def append(mod) + raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) + add_module(length.to_s, mod) + self + end + + def [](*idx) + idx.map do 
|id| + if id.is_a?(Integer) + @modules[id.to_s] + elsif id.is_a?(Range) + id.each do |i| + @modules[i.to_s] + end + end + end.flatten + end + end end end From 898b4dd7a0df57e8be78890da07a7d479a890240 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:27:08 +0000 Subject: [PATCH 15/28] Transformer: attention is all you need --- lib/torch.rb | 5 +- lib/torch/nn/transformer.rb | 103 ++++++++++++++++++++ lib/torch/nn/transformer_decoder.rb | 23 +++++ lib/torch/nn/transformer_decoder_layer.rb | 56 +++++++++++ lib/torch/nn/transformer_encoder.rb | 23 +++++ lib/torch/nn/transformer_encoder_layer.rb | 48 ++++++++++ test/nn/transformer_test.rb | 110 ++++++++++++++++++++++ 7 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 lib/torch/nn/transformer.rb create mode 100644 lib/torch/nn/transformer_decoder.rb create mode 100644 lib/torch/nn/transformer_decoder_layer.rb create mode 100644 lib/torch/nn/transformer_encoder.rb create mode 100644 lib/torch/nn/transformer_encoder_layer.rb create mode 100644 test/nn/transformer_test.rb diff --git a/lib/torch.rb b/lib/torch.rb index 620e016a..019170da 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -132,7 +132,6 @@ require "torch/nn/softsign" require "torch/nn/tanh" require "torch/nn/tanhshrink" -require "torch/nn/multihead_attention" # nn activations other require "torch/nn/log_softmax" @@ -144,6 +143,10 @@ require "torch/nn/embedding" require "torch/nn/embedding_bag" +# attention is all you need +require "torch/nn/multihead_attention" +require "torch/nn/transformer" + # nn distance functions require "torch/nn/cosine_similarity" require "torch/nn/pairwise_distance" diff --git a/lib/torch/nn/transformer.rb b/lib/torch/nn/transformer.rb new file mode 100644 index 00000000..12a93ae4 --- /dev/null +++ b/lib/torch/nn/transformer.rb @@ -0,0 +1,103 @@ +require_relative 'transformer_encoder_layer' +require_relative 'transformer_encoder' +require_relative 'transformer_decoder_layer' +require_relative 
'transformer_decoder' + +module Torch + module NN + class Transformer < Module + def initialize( + d_model: 512, nhead: 8, + num_encoder_layers: 6, num_decoder_layers: 6, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + custom_encoder: nil, custom_decoder: nil, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @encoder = + if custom_encoder + custom_encoder + else + encoder_layer = TransformerEncoderLayer.new( + d_model, nhead, + dim_feedforward: dim_feedforward, dropout: dropout, activation: activation, + layer_norm_eps: layer_norm_eps, batch_first: batch_first + ) + encoder_norm = LayerNorm.new(d_model, eps: layer_norm_eps) + TransformerEncoder.new(encoder_layer, num_encoder_layers, norm: encoder_norm) + end + + @decoder = + if custom_decoder + custom_decoder + else + decoder_layer = TransformerDecoderLayer.new( + d_model, nhead, + dim_feedforward: dim_feedforward, dropout: dropout, activation: activation, + layer_norm_eps: layer_norm_eps, batch_first: batch_first + ) + decoder_norm = LayerNorm.new(d_model, eps: layer_norm_eps) + TransformerDecoder.new(decoder_layer, num_decoder_layers, norm: decoder_norm) + end + + reset_parameters + + @d_model = d_model + @nhead = nhead + @batch_first = batch_first + end + + attr_reader :d_model, :nhead, :encoder, :decoder + + def batch_first? + !!@batch_first + end + + def reset_parameters + parameters.each { |p| Init.xavier_uniform!(p) if p.dim > 1 } + end + + def forward( + src, tgt, + src_mask: nil, tgt_mask: nil, memory_mask: nil, + src_key_padding_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil + ) + + if (!batch_first? && src.size(1) != tgt.size(1)) || + (batch_first? 
&& src.size(0) != tgt.size(0)) + + raise ArgumentError, "The batch number of src and tgt must be equal" + end + + if src.size(2) != d_model || tgt.size(2) != d_model + raise ArgumentError, "The feature number of src and tgt must be equal to d_model" + end + + memory = @encoder.(src, mask: src_mask, src_key_padding_mask: src_key_padding_mask) + @decoder.( + tgt, memory, + tgt_mask: tgt_mask, memory_mask: memory_mask, + tgt_key_padding_mask: tgt_key_padding_mask, memory_key_padding_mask: memory_key_padding_mask + ) + end + + class << self + def generate_square_subsequent_mask(sz) + mask = Torch.triu(Torch.ones([sz, sz])).eq(1).transpose(0, 1) + mask.float.masked_fill!(mask.eq(0), -Float::INFINITY).masked_fill!(mask.eq(1), 0.0) + mask + end + + alias :square_subsequent_mask :generate_square_subsequent_mask + end + + def generate_square_subsequent_mask(sz) + self.class.square_subsequent_mask(sz) + end + + alias :square_subsequent_mask :generate_square_subsequent_mask + end + end +end diff --git a/lib/torch/nn/transformer_decoder.rb b/lib/torch/nn/transformer_decoder.rb new file mode 100644 index 00000000..6a985853 --- /dev/null +++ b/lib/torch/nn/transformer_decoder.rb @@ -0,0 +1,23 @@ +module Torch + module NN + class TransformerDecoder < Module + def initialize(decoder_layer, num_layers, norm: nil) + super() + + state = decoder_layer.state_dict + layers = num_layers.times.map do |i| + decoder_layer.clone.tap { |l| l.load_state_dict(state) } + end + @layers = ModuleList.new(layers) + + @num_layers = num_layers + @norm = norm + end + + def forward(tgt, memory, tgt_mask: nil, memory_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil) + out = @layers.inject(tgt) { |kv, l| l.(kv, memory, tgt_mask: tgt_mask, memory_mask: memory_mask, tgt_key_padding_mask: tgt_key_padding_mask, memory_key_padding_mask: memory_key_padding_mask) } + @norm ? 
@norm.(out) : out + end + end + end +end diff --git a/lib/torch/nn/transformer_decoder_layer.rb b/lib/torch/nn/transformer_decoder_layer.rb new file mode 100644 index 00000000..139fd36b --- /dev/null +++ b/lib/torch/nn/transformer_decoder_layer.rb @@ -0,0 +1,56 @@ +module Torch + module NN + class TransformerDecoderLayer < Module + def initialize( + d_model, n_head, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @self_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + @multihead_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + + @linear1 = Linear.new(d_model, dim_feedforward) + @dropout = Dropout.new(p: dropout) + @linear2 = Linear.new(dim_feedforward, d_model) + + @norm1 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm2 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm3 = LayerNorm.new(d_model, eps: layer_norm_eps) + + @dropout1 = Dropout.new(p: dropout) + @dropout2 = Dropout.new(p: dropout) + @dropout3 = Dropout.new(p: dropout) + + @activation = activation_fn(activation) + end + + def forward(tgt, memory, tgt_mask: nil, memory_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil) + tmp = @self_attn.(tgt, tgt, tgt, attn_mask: tgt_mask, key_padding_mask: tgt_key_padding_mask).first + out = tgt + @dropout1.(tmp) + out = @norm1.(out) + + tmp = @multihead_attn.(tgt, memory, memory, attn_mask: memory_mask, key_padding_mask: memory_key_padding_mask).first + out += @dropout2.(tmp) + out = @norm2.(out) + + tmp = @activation.(@linear1.(out)) + tmp = @linear2.(@dropout.(tmp)) + out += @dropout3.(tmp) + + @norm3.(out) + end + + private + def activation_fn(activation) + case activation.to_sym + when :relu then F.method(:relu) + when :gelu then F.method(:gelu) + else raise ArgumentError, "Activation should be relu/gelu, not `#{activation}`" + end + end + end + end +end diff --git 
a/lib/torch/nn/transformer_encoder.rb b/lib/torch/nn/transformer_encoder.rb new file mode 100644 index 00000000..4f4a245e --- /dev/null +++ b/lib/torch/nn/transformer_encoder.rb @@ -0,0 +1,23 @@ +module Torch + module NN + class TransformerEncoder < Module + def initialize(encoder_layer, num_layers, norm: nil) + super() + + state = encoder_layer.state_dict + layers = num_layers.times.map do |i| + encoder_layer.clone.tap { |l| l.load_state_dict(state) } + end + @layers = ModuleList.new(layers) + + @num_layers = num_layers + @norm = norm + end + + def forward(src, mask: nil, src_key_padding_mask: nil) + out = @layers.inject(src) { |q, l| l.(q, src_mask: mask, src_key_padding_mask: src_key_padding_mask) } + @norm ? @norm.(out) : out + end + end + end +end diff --git a/lib/torch/nn/transformer_encoder_layer.rb b/lib/torch/nn/transformer_encoder_layer.rb new file mode 100644 index 00000000..c9aa5589 --- /dev/null +++ b/lib/torch/nn/transformer_encoder_layer.rb @@ -0,0 +1,48 @@ +module Torch + module NN + class TransformerEncoderLayer < Module + def initialize( + d_model, n_head, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @self_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + @linear1 = Linear.new(d_model, dim_feedforward) + @dropout = Dropout.new(p: dropout) + @linear2 = Linear.new(dim_feedforward, d_model) + + @norm1 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm2 = LayerNorm.new(d_model, eps: layer_norm_eps) + + @dropout1 = Dropout.new(p: dropout) + @dropout2 = Dropout.new(p: dropout) + + @activation = activation_fn(activation) + end + + def forward(src, src_mask: nil, src_key_padding_mask: nil) + tmp = @self_attn.(src, src, src, attn_mask: src_mask, key_padding_mask: src_key_padding_mask).first + out = src + @dropout1.(tmp) + out = @norm1.(out) + + tmp = @activation.(@linear1.(out)) + tmp = @linear2.(@dropout.(tmp)) + out += @dropout2.(tmp) 
+ + @norm2.(out) + end + + private + def activation_fn(activation) + case activation.to_sym + when :relu then F.method(:relu) + when :gelu then F.method(:gelu) + else raise ArgumentError, "Activation should be relu/gelu, not `#{activation}`" + end + end + end + end +end diff --git a/test/nn/transformer_test.rb b/test/nn/transformer_test.rb new file mode 100644 index 00000000..385818ae --- /dev/null +++ b/test/nn/transformer_test.rb @@ -0,0 +1,110 @@ +require_relative '../test_helper' + +class TranformerTest < Minitest::Test + T = 4 + S = 8 + B = 2 + E = 6 + + SEED = 42 + + NHEAD = 2 + + def test_transformer_encoder + Torch.manual_seed SEED + src = Torch.randn(S, B, E) + layer = Torch::NN::TransformerEncoderLayer.new(E, NHEAD) + encoder = Torch::NN::TransformerEncoder.new(layer, 4) + + expected_keys = ['layers.0.self_attn.in_proj_weight', 'layers.0.self_attn.in_proj_bias', 'layers.0.self_attn.out_proj.weight', 'layers.0.self_attn.out_proj.bias', 'layers.0.linear1.weight', 'layers.0.linear1.bias', 'layers.0.linear2.weight', 'layers.0.linear2.bias', 'layers.0.norm1.weight', 'layers.0.norm1.bias', 'layers.0.norm2.weight', 'layers.0.norm2.bias', 'layers.1.self_attn.in_proj_weight', 'layers.1.self_attn.in_proj_bias', 'layers.1.self_attn.out_proj.weight', 'layers.1.self_attn.out_proj.bias', 'layers.1.linear1.weight', 'layers.1.linear1.bias', 'layers.1.linear2.weight', 'layers.1.linear2.bias', 'layers.1.norm1.weight', 'layers.1.norm1.bias', 'layers.1.norm2.weight', 'layers.1.norm2.bias', 'layers.2.self_attn.in_proj_weight', 'layers.2.self_attn.in_proj_bias', 'layers.2.self_attn.out_proj.weight', 'layers.2.self_attn.out_proj.bias', 'layers.2.linear1.weight', 'layers.2.linear1.bias', 'layers.2.linear2.weight', 'layers.2.linear2.bias', 'layers.2.norm1.weight', 'layers.2.norm1.bias', 'layers.2.norm2.weight', 'layers.2.norm2.bias', 'layers.3.self_attn.in_proj_weight', 'layers.3.self_attn.in_proj_bias', 'layers.3.self_attn.out_proj.weight', 'layers.3.self_attn.out_proj.bias', 
'layers.3.linear1.weight', 'layers.3.linear1.bias', 'layers.3.linear2.weight', 'layers.3.linear2.bias', 'layers.3.norm1.weight', 'layers.3.norm1.bias', 'layers.3.norm2.weight', 'layers.3.norm2.bias'] + assert_equal Set.new(encoder.state_dict.keys), Set.new(expected_keys) + + out = encoder.(src).detach + + expected_out = Torch.tensor([ + [[ 0.7493, 0.4482, -2.1426, 0.5586, 0.5540, -0.1676], + [-1.7787, 1.3332, -0.3269, -0.2184, 0.9501, 0.0408]], + + [[ 0.0258, -0.3633, 0.4725, -0.5102, 1.8175, -1.4423], + [-0.8428, 0.8163, -1.7820, 0.9993, 0.1579, 0.6513]], + + [[-0.8899, 0.4441, -0.8299, 0.1568, 1.9144, -0.7954], + [ 0.9666, -1.8733, 1.0490, 0.3950, -0.5475, 0.0102]], + + [[-0.7694, 1.4112, -0.7571, -0.2797, 1.3567, -0.9616], + [-0.8945, 1.2717, 1.4981, -0.8380, -0.2971, -0.7402]], + + [[ 1.3992, -1.0341, -1.3842, -0.0247, 0.0162, 1.0276], + [-0.8861, 0.9142, -0.5524, 0.8005, 1.1647, -1.4410]], + + [[ 0.1054, -1.9251, -0.0421, 0.2794, 1.4807, 0.1016], + [-0.5518, -0.8835, -0.7934, 0.6458, 1.9350, -0.3522]], + + [[ 1.3186, -1.4948, -1.1052, 0.1480, 0.3011, 0.8324], + [-1.0710, 1.1253, -1.0413, -0.5237, 1.4925, 0.0183]], + + [[ 0.9012, -1.3407, 0.7998, -0.7706, -0.8129, 1.2232], + [ 0.5637, -1.5301, 1.0149, 1.2128, -0.7807, -0.4805]] + ]) + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end + + def test_transformer_decoder + Torch.manual_seed SEED + memory = Torch.randn([S, B, E]) + tgt = Torch.randn(T, B, E) + layer = Torch::NN::TransformerDecoderLayer.new(E, NHEAD) + decoder = Torch::NN::TransformerDecoder.new(layer, 4) + + expected_keys = ['layers.0.self_attn.in_proj_weight', 'layers.0.self_attn.in_proj_bias', 'layers.0.self_attn.out_proj.weight', 'layers.0.self_attn.out_proj.bias', 'layers.0.multihead_attn.in_proj_weight', 'layers.0.multihead_attn.in_proj_bias', 'layers.0.multihead_attn.out_proj.weight', 'layers.0.multihead_attn.out_proj.bias', 'layers.0.linear1.weight', 'layers.0.linear1.bias', 
'layers.0.linear2.weight', 'layers.0.linear2.bias', 'layers.0.norm1.weight', 'layers.0.norm1.bias', 'layers.0.norm2.weight', 'layers.0.norm2.bias', 'layers.0.norm3.weight', 'layers.0.norm3.bias', 'layers.1.self_attn.in_proj_weight', 'layers.1.self_attn.in_proj_bias', 'layers.1.self_attn.out_proj.weight', 'layers.1.self_attn.out_proj.bias', 'layers.1.multihead_attn.in_proj_weight', 'layers.1.multihead_attn.in_proj_bias', 'layers.1.multihead_attn.out_proj.weight', 'layers.1.multihead_attn.out_proj.bias', 'layers.1.linear1.weight', 'layers.1.linear1.bias', 'layers.1.linear2.weight', 'layers.1.linear2.bias', 'layers.1.norm1.weight', 'layers.1.norm1.bias', 'layers.1.norm2.weight', 'layers.1.norm2.bias', 'layers.1.norm3.weight', 'layers.1.norm3.bias', 'layers.2.self_attn.in_proj_weight', 'layers.2.self_attn.in_proj_bias', 'layers.2.self_attn.out_proj.weight', 'layers.2.self_attn.out_proj.bias', 'layers.2.multihead_attn.in_proj_weight', 'layers.2.multihead_attn.in_proj_bias', 'layers.2.multihead_attn.out_proj.weight', 'layers.2.multihead_attn.out_proj.bias', 'layers.2.linear1.weight', 'layers.2.linear1.bias', 'layers.2.linear2.weight', 'layers.2.linear2.bias', 'layers.2.norm1.weight', 'layers.2.norm1.bias', 'layers.2.norm2.weight', 'layers.2.norm2.bias', 'layers.2.norm3.weight', 'layers.2.norm3.bias', 'layers.3.self_attn.in_proj_weight', 'layers.3.self_attn.in_proj_bias', 'layers.3.self_attn.out_proj.weight', 'layers.3.self_attn.out_proj.bias', 'layers.3.multihead_attn.in_proj_weight', 'layers.3.multihead_attn.in_proj_bias', 'layers.3.multihead_attn.out_proj.weight', 'layers.3.multihead_attn.out_proj.bias', 'layers.3.linear1.weight', 'layers.3.linear1.bias', 'layers.3.linear2.weight', 'layers.3.linear2.bias', 'layers.3.norm1.weight', 'layers.3.norm1.bias', 'layers.3.norm2.weight', 'layers.3.norm2.bias', 'layers.3.norm3.weight', 'layers.3.norm3.bias'] + assert_equal Set.new(decoder.state_dict.keys), Set.new(expected_keys) + + out = decoder.(tgt, memory).detach + + 
expected_out = Torch.tensor([ + [[ 0.9910, -1.6614, 0.4585, 1.1229, -0.8866, -0.0244], + [ 0.2247, -0.9688, 0.4191, 1.8912, -0.9096, -0.6565]], + + [[-0.0579, -0.8439, 1.1724, 0.8325, 0.5904, -1.6936], + [ 0.7203, -0.9428, 1.3076, 0.3839, 0.1755, -1.6445]], + + [[ 1.1308, -1.1648, 0.9485, 0.5929, -0.0547, -1.4527], + [-0.2060, -1.2025, 0.2268, 1.5961, 0.7484, -1.1629]], + + [[-0.2963, -0.6104, 1.0706, 1.4588, -0.1225, -1.5001], + [ 0.8797, -1.1604, 0.9647, 0.8675, -0.0712, -1.4803]] + ]) + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end + + def test_entire_transformer + Torch.manual_seed SEED + src = Torch.randn([S, B, E]) + tgt = Torch.randn(T, B, E) + + tf = Torch::NN::Transformer.new(d_model: E, nhead: NHEAD) + out = tf.(src, tgt).detach + + expected_out = Torch.tensor([ + [[ 1.3946, 1.0311, -0.4112, -1.4705, -0.7782, 0.2342], + [ 1.3813, 0.7335, 0.4295, -1.7469, -0.3987, -0.3987]], + + [[ 0.8528, 0.2527, 1.0666, -1.0627, 0.5239, -1.6332], + [ 1.0099, 0.6658, 1.2135, -1.2414, -1.1116, -0.5361]], + + [[ 0.7495, 0.7391, 1.1455, -1.5647, -0.0059, -1.0636], + [ 0.6769, -0.6463, 1.1300, -0.6820, 1.0389, -1.5175]], + + [[ 1.0712, 0.8934, 0.2774, -1.7420, 0.3894, -0.8894], + [ 0.9592, 0.6803, 1.0008, -1.6594, -0.0541, -0.9268]] + ]) + + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end +end From 512962c961342045a26a3877fce4d719a6d11a57 Mon Sep 17 00:00:00 2001 From: Yvan Date: Thu, 30 Sep 2021 21:12:49 +0300 Subject: [PATCH 16/28] attribute readers for ConvNd It is often useful to access convolutional layer attributes, e.g. for output shapes precalculation. 
--- lib/torch/nn/convnd.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/torch/nn/convnd.rb b/lib/torch/nn/convnd.rb index cb300cb4..86289d1b 100644 --- a/lib/torch/nn/convnd.rb +++ b/lib/torch/nn/convnd.rb @@ -1,6 +1,8 @@ module Torch module NN class ConvNd < Module + attr_reader :in_channels, :out_channels, :kernel_size, :stride, :padding, :dilation, :transposed, :output_padding, :groups, :padding_mode + def initialize(in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) super() raise ArgumentError, "in_channels must be divisible by groups" if in_channels % groups != 0 From 480baa8e3e8687636b706e5060cf8539b3e591f4 Mon Sep 17 00:00:00 2001 From: Yvan Date: Thu, 30 Sep 2021 23:54:37 +0300 Subject: [PATCH 17/28] Fixed generation of square subsequent mask --- lib/torch/nn/transformer.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/torch/nn/transformer.rb b/lib/torch/nn/transformer.rb index c19b1e26..2d9912cb 100644 --- a/lib/torch/nn/transformer.rb +++ b/lib/torch/nn/transformer.rb @@ -87,7 +87,6 @@ class << self def generate_square_subsequent_mask(sz) mask = Torch.triu(Torch.ones([sz, sz])).eq(1).transpose(0, 1) mask.float.masked_fill!(mask.eq(0), -Float::INFINITY).masked_fill!(mask.eq(1), 0.0) - mask end alias :square_subsequent_mask :generate_square_subsequent_mask From 64be5d305b80653c53c33b556fce125cf016f98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 14 Nov 2025 17:30:42 +0300 Subject: [PATCH 18/28] distributed data parallel port with the supporting features: Torch#load:map_location, torchrun, Tests are provided --- README.md | 28 + bin/torchrun | 6 + examples/mnist/distributed.rb | 198 +++++++ ext/torch/accelerator.cpp | 52 ++ ext/torch/distributed.cpp | 271 +++++++++ ext/torch/ext.cpp | 4 + ext/torch/tensor.cpp | 110 ++++ lib/torch.rb | 118 +++- lib/torch/accelerator.rb | 20
+ lib/torch/distributed.rb | 195 +++++++ .../nn/parallel/distributed_data_parallel.rb | 101 ++++ lib/torch/torchrun.rb | 512 ++++++++++++++++++ test/distributed_test.rb | 77 +++ test/save_test.rb | 55 ++ test/support/scripts/show_ranks.rb | 7 + test/torchrun_test.rb | 33 ++ torch-rb.gemspec | 4 +- 17 files changed, 1788 insertions(+), 3 deletions(-) create mode 100755 bin/torchrun create mode 100644 examples/mnist/distributed.rb create mode 100644 ext/torch/accelerator.cpp create mode 100644 ext/torch/distributed.cpp create mode 100644 lib/torch/accelerator.rb create mode 100644 lib/torch/distributed.rb create mode 100644 lib/torch/nn/parallel/distributed_data_parallel.rb create mode 100644 lib/torch/torchrun.rb create mode 100644 test/distributed_test.rb create mode 100644 test/support/scripts/show_ranks.rb create mode 100644 test/torchrun_test.rb diff --git a/README.md b/README.md index a49c7e3f..d477c8ca 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,35 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori ## Examples - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) +- [Distributed MNIST training](examples/mnist/distributed.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) +## Distributed Training + +Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. 
+ +Start a single-node job with a process per GPU (or CPU) with: + +```sh +bundle exec torchrun --standalone --nproc-per-node=gpu path/to/training_script.rb --script-arg value +``` + +For multi-node runs, launch the same command on every node with matching rendezvous settings: + +```sh +bundle exec torchrun \ + --nnodes=2 \ + --node-rank=0 \ + --rdzv-backend=c10d \ + --rdzv-endpoint=host0.example.com:29503 \ + --rdzv-id=my-job \ + --nproc-per-node=4 \ + path/to/training_script.rb +``` + +On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: @@ -329,6 +355,8 @@ net.load_state_dict(Torch.load("net.pth")) net.eval ``` +`Torch.load` mirrors the Python API and accepts `map_location` and `weights_only` keyword arguments for compatibility with existing PyTorch checkpoints. + When saving a model in Python to load in Ruby, convert parameters to tensors (due to outstanding bugs in LibTorch) ```python diff --git a/bin/torchrun b/bin/torchrun new file mode 100755 index 00000000..d698ade2 --- /dev/null +++ b/bin/torchrun @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative "../lib/torch/torchrun" + +Torch::TorchRun.start(ARGV) diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb new file mode 100644 index 00000000..531b0f87 --- /dev/null +++ b/examples/mnist/distributed.rb @@ -0,0 +1,198 @@ +# Distributed MNIST training with Torch::Distributed + DistributedDataParallel +# Run with: ruby examples/mnist/distributed.rb --gpus 2 + +require "bundler/setup" +require "optparse" +require "torch" +require "torchvision" +require "socket" + +unless Torch::Distributed.available? 
+ abort "torch.distributed was not built in this binary" +end + +class MyNet < Torch::NN::Module + def initialize + super() + @conv1 = Torch::NN::Conv2d.new(1, 32, 3, stride: 1) + @conv2 = Torch::NN::Conv2d.new(32, 64, 3, stride: 1) + @dropout1 = Torch::NN::Dropout2d.new(p: 0.25) + @dropout2 = Torch::NN::Dropout2d.new(p: 0.5) + @fc1 = Torch::NN::Linear.new(9216, 128) + @fc2 = Torch::NN::Linear.new(128, 10) + end + + def forward(x) + x = Torch::NN::F.relu(@conv1.call(x)) + x = Torch::NN::F.relu(@conv2.call(x)) + x = Torch::NN::F.max_pool2d(x, 2) + x = @dropout1.call(x) + x = Torch.flatten(x, start_dim: 1) + x = Torch::NN::F.relu(@fc1.call(x)) + x = @dropout2.call(x) + Torch::NN::F.log_softmax(@fc2.call(x), 1) + end +end + +def parse_options + defaults = { + epochs: 5, + batch_size: 64, + lr: 1.0, + gamma: 0.7, + backend: "gloo", + gpus: Torch::CUDA.available? ? [Torch::CUDA.device_count, 1].max : 1, + log_interval: 20, + data_dir: File.join(__dir__, "data") + } + + OptionParser.new do |opts| + opts.banner = "Usage: ruby distributed.rb [options]" + opts.on("--epochs N", Integer, "Number of epochs (default: #{defaults[:epochs]})") { |v| defaults[:epochs] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } + opts.on("--gamma FLOAT", Float, "LR scheduler gamma (default: #{defaults[:gamma]})") { |v| defaults[:gamma] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--gpus N", Integer, "Number of GPUs/processes to use") { |v| defaults[:gpus] = v } + opts.on("--log-interval N", Integer, "Batches between log statements") { |v| defaults[:log_interval] = v } + opts.on("--data-dir PATH", String, "Directory for cached MNIST data") { |v| defaults[:data_dir] = v } + end.parse!(ARGV) + + defaults +end + 
+def free_port + server = TCPServer.new("127.0.0.1", 0) + port = server.addr[1] + server.close + port +end + +def spawn_workers(world_size) + port = free_port + + world_size.times.map do |rank| + fork do + yield(rank, world_size, port) + end + end.each { Process.wait2(_1) } +end + +def load_datasets(rank, data_dir) + transforms = TorchVision::Transforms::Compose.new([ + TorchVision::Transforms::ToTensor.new, + TorchVision::Transforms::Normalize.new([0.1307], [0.3081]) + ]) + + if rank.zero? + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + test = TorchVision::Datasets::MNIST.new(data_dir, train: false, download: true, transform: transforms) + Torch::Distributed.barrier + else + Torch::Distributed.barrier + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: false, transform: transforms) + test = TorchVision::Datasets::MNIST.new(data_dir, train: false, download: false, transform: transforms) + end + + [train, test] +end + +def subset_for_rank(dataset, rank, world_size) + indices = rank.step(dataset.size - 1, world_size).to_a + Torch::Utils::Data::Subset.new(dataset, indices) +end + +def train_epoch(model, device, loader, optimizer, epoch, rank, log_interval) + model.train + loader.each_with_index do |(data, target), batch_idx| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + next unless rank.zero? && (batch_idx % log_interval).zero? 
+ + processed = batch_idx * data.size(0) + total = loader.dataset.size + percent = 100.0 * processed / total + puts "Rank #{rank} | Epoch #{epoch} [#{processed}/#{total} (#{percent.round})%] Loss: #{'%.4f' % loss.item}" + end +end + +def evaluate(model, device, loader) + model.eval + loss = 0.0 + correct = 0 + Torch.no_grad do + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + output = model.call(data) + loss += Torch::NN::F.nll_loss(output, target, reduction: "sum").item + pred = output.argmax(1, keepdim: true) + correct += pred.eq(target.view_as(pred)).sum.item + end + end + + loss /= loader.dataset.size + acc = 100.0 * correct / loader.dataset.size + puts "Test set: Average loss: #{format('%.4f', loss)}, Accuracy: #{correct}/#{loader.dataset.size} (#{format('%.1f', acc)}%)" +end + +def run_worker(rank, world_size, port, options) + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + accelerator = Torch::Accelerator.current_accelerator + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + + device = if Torch::CUDA.available? && options[:gpus] > 0 + Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + else + Torch.device("cpu") + end + + model = MyNet.new.to(device) + ddp = Torch::NN::Parallel::DistributedDataParallel.new(model, device_ids: device.type == "cuda" ? 
[device.index] : nil) + optimizer = Torch::Optim::Adadelta.new(ddp.module.parameters, lr: options[:lr]) + scheduler = Torch::Optim::LRScheduler::StepLR.new(optimizer, step_size: 1, gamma: options[:gamma]) + + train_dataset, test_dataset = load_datasets(rank, options[:data_dir]) + train_subset = subset_for_rank(train_dataset, rank, world_size) + train_loader = Torch::Utils::Data::DataLoader.new(train_subset, batch_size: options[:batch_size], shuffle: true) + test_loader = Torch::Utils::Data::DataLoader.new(test_dataset, batch_size: options[:batch_size], shuffle: false) if rank.zero? + + options[:epochs].times do |epoch_idx| + epoch = epoch_idx + 1 + train_epoch(ddp, device, train_loader, optimizer, epoch, rank, options[:log_interval]) + if rank.zero? + evaluate(ddp.module, device, test_loader) + end + end + + Torch::Distributed.destroy_process_group +end + +options = parse_options +world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if world_size < 1 +if Torch::CUDA.available? 
+ max_devices = Torch::CUDA.device_count + if world_size > max_devices + raise "Requested #{world_size} GPUs but only #{max_devices} visible" + end +else + puts "CUDA not available, running #{world_size} CPU workers" +end + +Torch.manual_seed(1) + +if world_size == 1 + run_worker(0, 1, free_port, options) +else + spawn_workers(world_size) do |rank, total, port| + run_worker(rank, total, port, options) + end +end diff --git a/ext/torch/accelerator.cpp b/ext/torch/accelerator.cpp new file mode 100644 index 00000000..45cfcb41 --- /dev/null +++ b/ext/torch/accelerator.cpp @@ -0,0 +1,52 @@ +#include +#include +#include + +#include + +#include "utils.h" + +namespace { + +inline bool accelerator_available(c10::DeviceType device_type) { + return at::globalContext() + .getAcceleratorHooksInterface(device_type) + .isAvailable(); +} + +} // namespace + +void init_accelerator(Rice::Module& m) { + auto rb_mAccelerator = Rice::define_module_under(m, "Accelerator"); + + rb_mAccelerator.define_singleton_function( + "_current_device", + []() -> VALUE { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return Rice::Nil; + } + torch::Device device(acc.value()); + return Rice::detail::To_Ruby().convert(device); + }); + + rb_mAccelerator.define_singleton_function( + "_is_available", + []() { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return false; + } + return accelerator_available(acc.value()); + }); + + rb_mAccelerator.define_singleton_function( + "_device_count", + []() { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return 0; + } + return static_cast(at::accelerator::deviceCount()); + }); +} diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp new file mode 100644 index 00000000..de5f7c9a --- /dev/null +++ b/ext/torch/distributed.cpp @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "utils.h" + +#ifdef USE_C10D +#include +#include 
+#include +#include +#include +#include +#endif + +#if defined(USE_C10D) && defined(USE_C10D_NCCL) +#include +#endif + +#if defined(USE_C10D) && !defined(_WIN32) +#include +#endif + +namespace { + +#ifdef USE_C10D + +using StorePtr = c10::intrusive_ptr<::c10d::Store>; +using ProcessGroupPtr = c10::intrusive_ptr<::c10d::ProcessGroup>; + +struct StoreWrapper { + StoreWrapper() = default; + explicit StoreWrapper(StorePtr store) : store_(std::move(store)) {} + + StorePtr store_; +}; + +struct ProcessGroupWrapper { + ProcessGroupWrapper() = default; + explicit ProcessGroupWrapper(ProcessGroupPtr pg) : pg_(std::move(pg)) {} + + ProcessGroupPtr pg_; +}; + +ProcessGroupPtr default_process_group; + +ProcessGroupPtr resolve_process_group(Rice::Object pg_obj) { + if (pg_obj.is_nil()) { + if (!default_process_group) { + rb_raise(rb_eRuntimeError, "Distributed process group not initialized"); + } + return default_process_group; + } + auto& wrapper = Rice::detail::From_Ruby().convert(pg_obj.value()); + if (!wrapper.pg_) { + rb_raise(rb_eRuntimeError, "Invalid process group"); + } + return wrapper.pg_; +} + +int reduce_op_from_int(int code) { + if (code < 0 || code > static_cast(::c10d::ReduceOp::UNUSED)) { + rb_raise(rb_eArgError, "Unknown reduce op code"); + } + return code; +} + +#endif + +} // namespace + +void init_distributed(Rice::Module& m) { + auto rb_mDistributed = Rice::define_module_under(m, "Distributed"); +#ifdef USE_C10D + rb_mDistributed.define_singleton_function("available?", []() { return true; }); + + auto rb_cStore = Rice::define_class_under(rb_mDistributed, "Store"); + rb_cStore.define_method( + "_native?", + [](StoreWrapper& self) { + return static_cast(self.store_); + }); + + auto rb_cProcessGroup = Rice::define_class_under(rb_mDistributed, "ProcessGroup") + .define_method( + "rank", + [](ProcessGroupWrapper& self) { + return self.pg_ ? self.pg_->getRank() : -1; + }) + .define_method( + "size", + [](ProcessGroupWrapper& self) { + return self.pg_ ? 
self.pg_->getSize() : 0; + }) + .define_method( + "backend", + [](ProcessGroupWrapper& self) { + if (!self.pg_) { + return std::string(); + } + return self.pg_->getBackendName(); + }); + + rb_mDistributed.define_singleton_function( + "_create_tcp_store", + [rb_cStore](const std::string& host, + int port, + int world_size, + bool is_master, + int64_t timeout_millis, + bool wait_for_workers) { + ::c10d::TCPStoreOptions opts; + opts.port = static_cast(port); + opts.isServer = is_master; + opts.numWorkers = world_size; + opts.waitWorkers = wait_for_workers; + opts.timeout = std::chrono::milliseconds(timeout_millis); + auto store = c10::make_intrusive<::c10d::TCPStore>(host, opts); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); + + rb_mDistributed.define_singleton_function( + "_create_file_store", + [rb_cStore](const std::string& path, int world_size) { + auto store = c10::make_intrusive<::c10d::FileStore>(path, world_size); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); + +#if !defined(_WIN32) + rb_mDistributed.define_singleton_function( + "_create_hash_store", + [rb_cStore]() { + auto store = c10::make_intrusive<::c10d::HashStore>(); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); +#endif + + rb_mDistributed.define_singleton_function( + "_init_process_group", + [rb_cProcessGroup](const std::string& backend, + StoreWrapper& store_wrapper, + int rank, + int world_size, + int64_t timeout_millis) { + StorePtr store = store_wrapper.store_; + if (!store) { + rb_raise(rb_eArgError, "Store is required for init_process_group"); + } + + std::string backend_lower = backend; + std::transform(backend_lower.begin(), backend_lower.end(), backend_lower.begin(), ::tolower); + + ProcessGroupPtr pg; + if (backend_lower == "gloo") { +#ifdef USE_C10D_GLOO + auto options = ::c10d::ProcessGroupGloo::Options::create(); + options->timeout = std::chrono::milliseconds(timeout_millis); + 
options->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); + pg = c10::make_intrusive<::c10d::ProcessGroupGloo>(store, rank, world_size, options); +#else + rb_raise(rb_eRuntimeError, "Gloo backend is not available in this build"); +#endif + } else if (backend_lower == "nccl") { +#if defined(USE_C10D_NCCL) + auto options = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(store, rank, world_size, options); +#else + rb_raise(rb_eRuntimeError, "NCCL backend is not available in this build"); +#endif + } else { + rb_raise(rb_eArgError, "Unsupported backend: %s", backend.c_str()); + } + + default_process_group = pg; + return Rice::Data_Object(new ProcessGroupWrapper(pg), rb_cProcessGroup, true); + }); + + rb_mDistributed.define_singleton_function( + "_destroy_process_group", + []() { + default_process_group.reset(); + return Rice::Nil; + }); + + rb_mDistributed.define_singleton_function( + "_initialized?", + []() { + return static_cast(default_process_group); + }); + + rb_mDistributed.define_singleton_function( + "_default_process_group", + [rb_cProcessGroup]() { + if (!default_process_group) { + return Rice::Nil; + } + return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), rb_cProcessGroup, true); + }); + + rb_mDistributed.define_singleton_function( + "_get_world_size", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + return pg->getSize(); + }); + + rb_mDistributed.define_singleton_function( + "_get_rank", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + return pg->getRank(); + }); + + rb_mDistributed.define_singleton_function( + "_barrier", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::BarrierOptions opts; + auto work = pg->barrier(opts); + work->wait(); + return Rice::Nil; + }); + + rb_mDistributed.define_singleton_function( + "_all_reduce", + [](torch::Tensor& tensor, int op_code, 
Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::AllreduceOptions opts; + opts.reduceOp = ::c10d::ReduceOp(static_cast<::c10d::ReduceOp::RedOpType>(reduce_op_from_int(op_code))); + std::vector tensors{tensor}; + auto work = pg->allreduce(tensors, opts); + work->wait(); + return tensor; + }); + + rb_mDistributed.define_singleton_function( + "_broadcast", + [](torch::Tensor& tensor, int src, Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::BroadcastOptions opts; + opts.rootRank = src; + std::vector tensors{tensor}; + auto work = pg->broadcast(tensors, opts); + work->wait(); + return tensor; + }); + + auto rb_mReduceOp = Rice::define_module_under(rb_mDistributed, "ReduceOp"); + rb_mReduceOp.const_set("SUM", INT2NUM(static_cast(::c10d::ReduceOp::SUM))); + rb_mReduceOp.const_set("AVG", INT2NUM(static_cast(::c10d::ReduceOp::AVG))); + rb_mReduceOp.const_set("PRODUCT", INT2NUM(static_cast(::c10d::ReduceOp::PRODUCT))); + rb_mReduceOp.const_set("MIN", INT2NUM(static_cast(::c10d::ReduceOp::MIN))); + rb_mReduceOp.const_set("MAX", INT2NUM(static_cast(::c10d::ReduceOp::MAX))); + rb_mReduceOp.const_set("BAND", INT2NUM(static_cast(::c10d::ReduceOp::BAND))); + rb_mReduceOp.const_set("BOR", INT2NUM(static_cast(::c10d::ReduceOp::BOR))); + rb_mReduceOp.const_set("BXOR", INT2NUM(static_cast(::c10d::ReduceOp::BXOR))); + rb_mReduceOp.const_set("PREMUL_SUM", INT2NUM(static_cast(::c10d::ReduceOp::PREMUL_SUM))); + + rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::c10d::kProcessGroupDefaultTimeout.count() / 1000)); +#else + rb_mDistributed.define_singleton_function("available?", []() { return false; }); +#endif +} diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index eb6fb7d3..dc9cef20 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -6,6 +6,8 @@ void init_fft(Rice::Module& m); void init_linalg(Rice::Module& m); void init_nn(Rice::Module& m); void init_special(Rice::Module& m); +void init_accelerator(Rice::Module& m); 
+void init_distributed(Rice::Module& m); void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions); void init_torch(Rice::Module& m); @@ -40,10 +42,12 @@ void Init_ext() { init_fft(m); init_linalg(m); init_special(m); + init_accelerator(m); init_backends(m); init_cuda(m); init_generator(m, rb_cGenerator); init_ivalue(m, rb_cIValue); init_random(m); + init_distributed(m); } diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index 390b5a9c..c7e003d8 100644 --- a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -1,9 +1,12 @@ +#include #include +#include #include #include #include +#include #include "tensor_functions.h" #include "ruby_arg_parser.h" @@ -26,6 +29,103 @@ Array flat_data(Tensor& tensor) { } Rice::Class rb_cTensor; +Rice::Class rb_cHookHandle; + +namespace { + +struct RubyTensorHook { + explicit RubyTensorHook(VALUE proc) : proc_(proc) { + rb_gc_register_address(&proc_); + } + + ~RubyTensorHook() { + rb_gc_unregister_address(&proc_); + } + + at::Tensor call(const at::Tensor& grad) { + HookCallData data{proc_, grad}; + rb_thread_call_with_gvl(&RubyTensorHook::invoke, &data); + if (data.return_value_defined) { + return data.return_tensor; + } + return grad; + } + + private: + struct HookCallData { + VALUE proc; + at::Tensor grad; + at::Tensor return_tensor; + bool return_value_defined = false; + }; + + static void* invoke(void* arg) { + auto* data = reinterpret_cast(arg); + VALUE grad_obj = Rice::detail::To_Ruby().convert(data->grad); + VALUE result = rb_funcall(data->proc, rb_intern("call"), 1, grad_obj); + if (!NIL_P(result)) { + data->return_tensor = Rice::detail::From_Ruby().convert(result); + data->return_value_defined = true; + } + return nullptr; + } + + VALUE proc_; +}; + +class HookHandle { + public: + HookHandle(const at::Tensor& tensor, unsigned handle, std::shared_ptr hook) + : tensor_(tensor), handle_(handle), hook_(std::move(hook)), removed_(false) {} + + HookHandle(const HookHandle& other) = default; + 
HookHandle& operator=(const HookHandle& other) = default; + + ~HookHandle() { + remove(); + } + + void remove() { + if (!removed_) { + tensor_.remove_hook(handle_); + removed_ = true; + hook_.reset(); + } + } + + private: + at::Tensor tensor_; + unsigned handle_; + std::shared_ptr hook_; + bool removed_; +}; + +VALUE tensor_register_hook(int argc, VALUE* argv, VALUE self_) { + HANDLE_TH_ERRORS + VALUE callable = Qnil; + rb_scan_args(argc, argv, "01", &callable); + if (NIL_P(callable)) { + if (rb_block_given_p()) { + callable = rb_block_proc(); + } else { + rb_raise(rb_eArgError, "Expected a callable or block"); + } + } + if (!rb_respond_to(callable, rb_intern("call"))) { + rb_raise(rb_eArgError, "Hook must respond to call"); + } + + Tensor& self = Rice::detail::From_Ruby().convert(self_); + auto hook = std::make_shared(callable); + unsigned handle = self.register_hook([hook](const at::Tensor& grad) { + return hook->call(grad); + }); + + return Rice::Data_Object(new HookHandle(self, handle, hook), rb_cHookHandle, true); + END_HANDLE_TH_ERRORS +} + +} // namespace std::vector index_vector(Array a) { Object obj; @@ -102,7 +202,17 @@ void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions add_tensor_functions(rb_cTensor); THPVariableClass = rb_cTensor.value(); + auto rb_mAutograd = Rice::define_module_under(m, "Autograd"); + rb_cHookHandle = Rice::define_class_under(rb_mAutograd, "RemovableHandle") + .define_method( + "remove", + [](HookHandle& self) { + self.remove(); + return Rice::Nil; + }); + rb_define_method(rb_cTensor, "backward", (VALUE (*)(...)) tensor__backward, -1); + rb_define_method(rb_cTensor, "register_hook", (VALUE (*)(...)) tensor_register_hook, -1); rb_cTensor .define_method("cuda?", [](Tensor& self) { return self.is_cuda(); }) diff --git a/lib/torch.rb b/lib/torch.rb index 266c2859..ce213e1d 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -9,6 +9,8 @@ # modules require_relative "torch/device" +require_relative 
"torch/accelerator" +require_relative "torch/distributed" require_relative "torch/inspector" require_relative "torch/tensor" require_relative "torch/version" @@ -191,6 +193,7 @@ require_relative "torch/nn/functional" require_relative "torch/nn/functional_attention" require_relative "torch/nn/init" +require_relative "torch/nn/parallel/distributed_data_parallel" # utils require_relative "torch/utils/data" @@ -399,11 +402,14 @@ def save(obj, f) File.binwrite(f, _save(to_ivalue(obj))) end - def load(filename) + def load(filename, map_location: nil, weights_only: false) # keep backwards compatibility File.open(filename, "rb") { |f| f.read(1) } - to_ruby(_load(filename)) + result = to_ruby(_load(filename)) + ensure_weights_only_contents!(result) if weights_only + result = apply_map_location(result, map_location) if map_location + result end def tensor(data, **options) @@ -536,6 +542,114 @@ def to_ruby(ivalue) end end + WEIGHTS_ONLY_PRIMITIVE_CLASSES = + [ + NilClass, + TrueClass, + FalseClass, + Integer, + Float, + String + ].freeze + + def ensure_weights_only_contents!(obj) + case obj + when *WEIGHTS_ONLY_PRIMITIVE_CLASSES + obj + when Tensor + obj + when Array + obj.each { |value| ensure_weights_only_contents!(value) } + when Hash + obj.each do |key, value| + ensure_weights_only_contents!(key) + ensure_weights_only_contents!(value) + end + else + raise Error, "weights_only load supports tensors, primitive Ruby types, arrays, and hashes (found #{obj.class.name})" + end + end + + def apply_map_location(obj, map_location) + case obj + when Tensor + map_tensor_location(obj, map_location) + when Array + obj.map { |value| apply_map_location(value, map_location) } + when Hash + obj.each_with_object({}) do |(key, value), memo| + memo[apply_map_location(key, map_location)] = apply_map_location(value, map_location) + end + else + obj + end + end + + def map_tensor_location(tensor, map_location) + case map_location + when nil + tensor + when Hash + target = 
lookup_map_location_target(map_location, tensor.device) + return tensor if target.nil? + map_tensor_location(tensor, target) + else + return map_tensor_location_callable(tensor, map_location) if map_location.respond_to?(:call) + device = normalize_map_location_device(map_location) + tensor.to(device) + end + end + + def map_tensor_location_callable(tensor, callable) + mapped = callable.call(tensor, map_location_device_tag(tensor.device)) + return tensor if mapped.nil? + unless mapped.is_a?(Tensor) + raise Error, "map_location callable must return a Tensor or nil (got #{mapped.class.name})" + end + mapped + end + + def lookup_map_location_target(mapping, device) + key = map_location_device_tag(device) + mapping.each do |candidate, value| + candidate_key = + case candidate + when Device + map_location_device_tag(candidate) + when String, Symbol + candidate.to_s + else + candidate + end + return value if candidate_key == key + end + nil + end + + def map_location_device_tag(device) + case device + when Device + tag = device.type + tag += ":#{device.index}" unless device.index.nil? + tag + when String, Symbol + device.to_s + else + raise Error, "Unknown device reference: #{device.inspect}" + end + end + + def normalize_map_location_device(location) + case location + when Device + location + when String, Symbol + device(location.to_s) + else + raise Error, "Unsupported map_location: #{location.inspect}" + end + end + def tensor_size(size) size.flatten end diff --git a/lib/torch/accelerator.rb b/lib/torch/accelerator.rb new file mode 100644 index 00000000..abfd95bb --- /dev/null +++ b/lib/torch/accelerator.rb @@ -0,0 +1,20 @@ +module Torch + module Accelerator + class << self + def current_accelerator(check_available: false) + device = _current_device + return nil unless device + return nil if check_available && !available? + device + end + + def device_count + _device_count + end + + def available? 
+ _is_available + end + end + end +end diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb new file mode 100644 index 00000000..77428b52 --- /dev/null +++ b/lib/torch/distributed.rb @@ -0,0 +1,195 @@ +require "socket" + +module Torch + module Distributed + DEFAULT_DEVICE_BACKENDS = { + "cpu" => "gloo", + "cuda" => "nccl", + "xpu" => "xccl", + "mps" => "gloo" + }.freeze + + class << self + def initialized? + _initialized? + end + + def init_process_group(backend = nil, init_method: "env://", store: nil, rank: nil, world_size: nil, timeout: DEFAULT_TIMEOUT, wait_for_workers: true, device_id: nil) + raise Torch::Error, "torch.distributed is not available" unless available? + + backend ||= default_backend_for(device_id) + + if store.nil? + case init_method + when "env://" + rank = Integer(ENV.fetch("RANK")) if rank.nil? + world_size = Integer(ENV.fetch("WORLD_SIZE")) if world_size.nil? + master_addr = ENV.fetch("MASTER_ADDR", "127.0.0.1") + master_port = Integer(ENV.fetch("MASTER_PORT", "29500")) + raise ArgumentError, "rank is required" if rank.nil? + raise ArgumentError, "world_size is required" if world_size.nil? + is_master = rank.zero? + store = TCPStore.new(master_addr, master_port, world_size, is_master, wait_for_workers: wait_for_workers, timeout: timeout) + else + raise ArgumentError, "store is required when using init_method=#{init_method.inspect}" + end + end + + raise ArgumentError, "rank is required" if rank.nil? + raise ArgumentError, "world_size is required" if world_size.nil? 
+ + timeout_ms = (timeout * 1000).to_i + _init_process_group(backend, store, rank, world_size, timeout_ms) + end + + def destroy_process_group + _destroy_process_group + end + + def default_process_group + _default_process_group + end + + def get_world_size(group = nil) + ensure_process_group!(group) + _get_world_size(group) + end + + def get_rank(group = nil) + ensure_process_group!(group) + _get_rank(group) + end + + def barrier(group: nil) + ensure_process_group!(group) + _barrier(group) + end + + def all_reduce(tensor, op: ReduceOp::SUM, group: nil) + ensure_process_group!(group) + _all_reduce(tensor, op, group) + end + + def broadcast(tensor, src:, group: nil) + ensure_process_group!(group) + _broadcast(tensor, src, group) + end + + def get_default_backend_for_device(device) + backend = DEFAULT_DEVICE_BACKENDS[device_type_from(device)] + raise ArgumentError, "Default backend not registered for device: #{device.inspect}" unless backend + backend + end + + def fork_world(world_size, host: "127.0.0.1") + raise ArgumentError, "world_size must be positive" unless world_size.to_i.positive? + raise ArgumentError, "block required" unless block_given? + + port = free_port(host: host) + readers = [] + pids = [] + world_size.times do |rank| + reader, writer = IO.pipe + pid = fork do + reader.close + begin + writer.binmode + result = yield(rank, port) + Marshal.dump(result, writer) + exit! 0 + rescue => e + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + exit! 1 + ensure + writer.close unless writer.closed? + end + end + writer.close + readers << reader + pids << pid + end + + outputs = readers.map do |reader| + data = Marshal.load(reader) + reader.close + data + end + + statuses = pids.each_with_index.map do |pid, idx| + _pid, status = Process.wait2(pid) + [idx, pid, status] + end + + statuses.each do |idx, pid, status| + output = outputs[idx] + if !status.success? 
|| (output.is_a?(Hash) && output[:error]) + message = if output.is_a?(Hash) && output[:error] + "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" + else + "Child #{pid} exited with status #{status.exitstatus}" + end + raise Torch::Error, message + end + end + + outputs + end + + def free_port(host: "127.0.0.1") + server = TCPServer.new(host, 0) + port = server.addr[1] + server.close + port + end + + private + + def ensure_process_group!(group) + return if group || initialized? + + raise Torch::Error, "Default process group is not initialized" + end + + def default_backend_for(device_id) + get_default_backend_for_device(device_id) + end + + def device_type_from(device) + case device + when Torch::Device + device.type + when String + Torch.device(device).type + when Integer + Torch.device("cuda:#{device}").type + when NilClass + Torch::Accelerator.current_accelerator&.type || "cpu" + else + Torch.device(device).type + end + rescue => e + raise ArgumentError, "Invalid device #{device.inspect}: #{e.message}" + end + end + + class TCPStore + def self.new(host, port, world_size, is_master, wait_for_workers: true, timeout: DEFAULT_TIMEOUT) + Torch::Distributed._create_tcp_store(host, port, world_size, is_master, (timeout * 1000).to_i, wait_for_workers) + end + end + + class FileStore + def self.new(path, world_size) + Torch::Distributed._create_file_store(path, world_size) + end + end + + if respond_to?(:_create_hash_store) + class HashStore + def self.new + Torch::Distributed._create_hash_store + end + end + end + end +end diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb new file mode 100644 index 00000000..87178f3b --- /dev/null +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -0,0 +1,101 @@ +module Torch + module NN + module Parallel + class DistributedDataParallel < Module + attr_reader :module, :process_group + + def initialize(mod, device_ids: nil, 
process_group: nil, broadcast_buffers: true) + super() + raise Torch::Error, "torch.distributed is not available" unless Torch::Distributed.available? + + @module = mod + @broadcast_buffers = broadcast_buffers + @process_group = process_group || Torch::Distributed.default_process_group + raise Torch::Error, "Process group must be initialized before using DistributedDataParallel" unless @process_group + + @world_size = Torch::Distributed.get_world_size(@process_group) + @rank = Torch::Distributed.get_rank(@process_group) + @device = Array(device_ids).compact.first + move_to_device(@device) if @device + + synchronize_parameters + @hook_handles = register_parameter_hooks + end + + def forward(*inputs, **kwargs) + outputs = @module.call(*move_inputs(inputs), **move_kwargs(kwargs)) + broadcast_buffers_if_needed + outputs + end + + alias_method :call, :forward + + def train(mode = true) + @module.train(mode) + broadcast_buffers_if_needed + self + end + + private + + def move_to_device(device) + return unless device + + @module.to(device) + end + + def move_inputs(inputs) + return inputs unless @device + + inputs.map { |value| move_value(value, @device) } + end + + def move_kwargs(kwargs) + return kwargs unless @device + + kwargs.transform_values { |value| move_value(value, @device) } + end + + def move_value(value, device) + case value + when Torch::Tensor + value.to(device) + when Array + value.map { |v| move_value(v, device) } + when Hash + value.transform_values { |v| move_value(v, device) } + else + value + end + end + + def synchronize_parameters + Torch::Distributed.barrier(group: @process_group) + @module.parameters.each do |param| + Torch::Distributed.broadcast(param, src: 0, group: @process_group) + end + broadcast_buffers_if_needed + end + + def broadcast_buffers_if_needed + return unless @broadcast_buffers + + @module.buffers.each do |buffer| + Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + end + end + + def register_parameter_hooks + 
@module.parameters.filter_map do |param| + next unless param.requires_grad? + + param.register_hook do |grad| + Torch::Distributed.all_reduce(grad, group: @process_group) + grad.div!(@world_size.to_f) + end + end + end + end + end + end +end diff --git a/lib/torch/torchrun.rb b/lib/torch/torchrun.rb new file mode 100644 index 00000000..b5913334 --- /dev/null +++ b/lib/torch/torchrun.rb @@ -0,0 +1,512 @@ +# frozen_string_literal: true + +require "optparse" +require "socket" +require "etc" +require "securerandom" +require "rbconfig" + +require_relative "../torch" + +module Torch + module TorchRun + SIGNALS = %w[INT TERM QUIT].freeze + + class Error < StandardError; end + + class Parser + attr_reader :parser + + def initialize + @parser = OptionParser.new + end + + def parse(argv) + options = default_options + + parser.banner = "Usage: torchrun [options] TRAINING_SCRIPT [script args]" + parser.separator "" + parser.separator "Launch parameters:" + + parser.on("--nnodes MIN[:MAX]", String, "Number of nodes or range (default: #{options[:nnodes]})") do |value| + options[:nnodes] = value + end + + parser.on("--nproc-per-node VALUE", String, "Processes per node (int, gpu, cpu, auto). Default: #{options[:nproc_per_node]}") do |value| + options[:nproc_per_node] = value + end + + parser.on("--node-rank VALUE", Integer, "Rank of the node for multi-node jobs. Default: #{options[:node_rank]}") do |value| + options[:node_rank] = value + end + + parser.on("--rdzv-backend NAME", String, "Rendezvous backend (static or c10d). Default: #{options[:rdzv_backend]}") do |value| + options[:rdzv_backend] = value + end + + parser.on("--rdzv-endpoint HOST[:PORT]", String, "Rendezvous endpoint. Default: use --master-addr/--master-port") do |value| + options[:rdzv_endpoint] = value + end + + parser.on("--rdzv-id ID", String, "User defined job id. 
Default: #{options[:rdzv_id]}") do |value| + options[:rdzv_id] = value + end + + parser.on("--rdzv-conf CONF", String, "Additional rendezvous config (k=v,k2=v2)") do |value| + options[:rdzv_conf] = parse_kv_pairs(value) + end + + parser.on("--standalone", "Start a local rendezvous store on a free port") do + options[:standalone] = true + end + + parser.on("--max-restarts VALUE", Integer, "Restarts before failing. Default: #{options[:max_restarts]}") do |value| + options[:max_restarts] = value + end + + parser.on("--monitor-interval SECONDS", Float, "Delay between restart attempts. Default: #{options[:monitor_interval]}") do |value| + options[:monitor_interval] = value + end + + parser.on("--role NAME", String, "Role for the worker group. Default: #{options[:role]}") do |value| + options[:role] = value + end + + parser.on("--master-addr HOST", String, "Master address for static rendezvous. Default: #{options[:master_addr]}") do |value| + options[:master_addr] = value + end + + parser.on("--master-port PORT", Integer, "Master port for static rendezvous. Default: #{options[:master_port]}") do |value| + options[:master_port] = value + end + + parser.on("--pass-local-rank-arg", "Append --local-rank to the training script invocation") do + options[:pass_local_rank_arg] = true + end + + parser.on("--no-ruby", "Execute the training script directly instead of `#{RbConfig.ruby}`") do + options[:no_ruby] = true + end + + parser.on("-h", "--help", "Prints this help") do + puts parser + exit + end + + rest = parser.parse!(argv) + raise OptionParser::MissingArgument, "training_script" if rest.empty? 
+ + training_script = rest.shift + [options, training_script, rest] + end + + def to_s + parser.to_s + end + + private + + def default_options + { + nnodes: "1:1", + nproc_per_node: "1", + node_rank: 0, + rdzv_backend: "static", + rdzv_endpoint: "", + rdzv_id: "none", + rdzv_conf: {}, + standalone: false, + max_restarts: 0, + monitor_interval: 1.0, + role: "default", + master_addr: "127.0.0.1", + master_port: 29_500, + pass_local_rank_arg: false, + no_ruby: false + } + end + + def parse_kv_pairs(value) + return {} if value.nil? || value.strip.empty? + + value.split(",").each_with_object({}) do |pair, acc| + key, val = pair.split("=", 2) + raise OptionParser::InvalidArgument, "Invalid rendezvous config entry: #{pair.inspect}" unless key && val + + acc[key.strip] = val.strip + end + end + end + + module_function + + def start(argv, out: $stdout, err: $stderr) + parser = Parser.new + options, script, script_args = parser.parse(argv) + status = Launcher.new(options, script, script_args, out: out, err: err).run + exit(status) + rescue OptionParser::ParseError => e + err.puts(e.message) + err.puts(parser) + exit(2) + rescue Error => e + err.puts("torchrun: #{e.message}") + exit(1) + end + + class Launcher + def initialize(options, script, script_args, out: $stdout, err: $stderr) + @options = options + @script = script + @script_args = script_args + @out = out + @err = err + + @local_world_size = determine_local_world_size(@options[:nproc_per_node]) + @min_nodes, @max_nodes = parse_nnodes(@options[:nnodes]) + @num_nodes = ensure_fixed_nnodes(@min_nodes, @max_nodes) + @node_rank = @options[:node_rank] + @max_restarts = [@options[:max_restarts], 0].max + @monitor_interval = [@options[:monitor_interval], 0.0].max + @role = @options[:role] + @pass_local_rank_arg = @options[:pass_local_rank_arg] + @no_ruby = @options[:no_ruby] + validate_node_rank! + + setup_rendezvous! 
+ end + + def run + restarts = 0 + + loop do + status = launch_worker_group(restarts) + return status if status.zero? || @signal_received + return status if restarts >= @max_restarts + + restarts += 1 + log("Worker group failed (exit #{status}). Restarting #{restarts}/#{@max_restarts} ...") + sleep(@monitor_interval) if @monitor_interval.positive? + end + end + + private + + def launch_worker_group(restart_count) + @signal_received = nil + @current_pids = spawn_workers(restart_count) + handler_state = setup_signal_handlers + status = monitor_workers(@current_pids.dup) + cleanup_workers(@current_pids) + restore_signal_handlers(handler_state) + return signal_exit_status if @signal_received + + status + ensure + @current_pids = [] + end + + def spawn_workers(restart_count) + base_env = base_environment(restart_count) + Array.new(@local_world_size) do |local_rank| + env = base_env.merge(rank_environment(local_rank)) + spawn_worker(env, local_rank) + end + end + + def spawn_worker(env, local_rank) + args = command_arguments(local_rank) + Process.spawn(env, *args) + rescue SystemCallError => e + raise Error, "failed to launch worker #{local_rank}: #{e.message}" + end + + def command_arguments(local_rank) + cmd = [] + if @no_ruby + cmd << @script + else + cmd << RbConfig.ruby + cmd << @script + end + cmd.concat(@script_args) + cmd << "--local-rank=#{local_rank}" if @pass_local_rank_arg + cmd + end + + def base_environment(restart_count) + endpoint = "#{@master_addr}:#{@master_port}" + env = { + "MASTER_ADDR" => @master_addr, + "MASTER_PORT" => @master_port.to_s, + "WORLD_SIZE" => world_size.to_s, + "LOCAL_WORLD_SIZE" => @local_world_size.to_s, + "GROUP_RANK" => @node_rank.to_s, + "TORCHRUN_ROLE" => @role, + "TORCHRUN_NNODES" => @num_nodes.to_s, + "TORCHRUN_NPROC_PER_NODE" => @local_world_size.to_s, + "TORCHELASTIC_RUN_ID" => @rdzv_id, + "TORCHRUN_RDZV_BACKEND" => @rdzv_backend, + "TORCHRUN_RDZV_ENDPOINT" => endpoint, + "TORCHELASTIC_RESTART_COUNT" => restart_count.to_s, + 
"TORCHRUN_STANDALONE" => @standalone ? "1" : "0" + } + unless @rdzv_conf.empty? + env["TORCHRUN_RDZV_CONF"] = @rdzv_conf.map { |k, v| "#{k}=#{v}" }.join(",") + end + ENV.to_h.merge(env) + end + + def rank_environment(local_rank) + rank = @node_rank * @local_world_size + local_rank + { + "LOCAL_RANK" => local_rank.to_s, + "RANK" => rank.to_s + } + end + + def monitor_workers(pids) + exit_code = 0 + remaining = pids.dup + until remaining.empty? + pid, status = Process.wait2 + next unless pid + + remaining.delete(pid) + unless status.success? + exit_code = exit_status_from(status) + terminate_workers(remaining) + break + end + end + exit_code + rescue Errno::ECHILD + 0 + end + + def terminate_workers(pids) + return if pids.empty? + + pids.each { |pid| send_signal(pid, "TERM") } + sleep(0.2) + pids.each do |pid| + next unless process_alive?(pid) + + send_signal(pid, "KILL") + end + pids.each do |pid| + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def process_alive?(pid) + Process.kill(0, pid) + true + rescue Errno::ESRCH + false + end + + def setup_signal_handlers + SIGNALS.each_with_object({}) do |sig, acc| + next unless Signal.list.key?(sig) + + previous = Signal.trap(sig) do + @signal_received = sig + forward_signal(sig) + end + acc[sig] = previous + end + end + + def forward_signal(sig) + (@current_pids || []).each { |pid| send_signal(pid, sig) } + end + + def restore_signal_handlers(state) + return unless state + + state.each do |sig, previous| + Signal.trap(sig, previous) + end + end + + def send_signal(pid, sig) + Process.kill(sig, pid) + rescue Errno::ESRCH + nil + end + + def cleanup_workers(pids) + pids.each do |pid| + next unless process_alive?(pid) + + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def signal_exit_status + return 0 unless @signal_received + + 128 + Signal.list.fetch(@signal_received, 0) + end + + def exit_status_from(status) + if status.exited? + status.exitstatus + elsif status.signaled? 
+ 128 + status.termsig + else + 1 + end + end + + def determine_local_world_size(value) + spec = value.to_s.strip.downcase + case spec + when "", "1" + 1 + when /\A\d+\z/ + amount = spec.to_i + raise Error, "nproc-per-node must be >= 1" if amount < 1 + + amount + when "gpu" + gpu_count = cuda_device_count + raise Error, "CUDA is not available for --nproc-per-node=gpu" if gpu_count.zero? + + gpu_count + when "auto" + gpu_count = cuda_device_count + return gpu_count if gpu_count.positive? + + cpu_count + when "cpu" + cpu_count + else + raise Error, "Unsupported --nproc-per-node value: #{value}" + end + end + + def cuda_device_count + return 0 unless defined?(Torch::CUDA) + return 0 unless Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + return 0 unless Torch::CUDA.respond_to?(:device_count) + + Torch::CUDA.device_count + rescue StandardError + 0 + end + + def cpu_count + Etc.respond_to?(:nprocessors) ? (Etc.nprocessors || 1) : 1 + rescue StandardError + 1 + end + + def parse_nnodes(value) + parts = value.split(":") + nums = parts.map do |part| + Integer(part, exception: false) + end + raise Error, "Invalid --nnodes value: #{value.inspect}" if nums.any?(&:nil?) + + if nums.length == 1 + [nums.first, nums.first] + elsif nums.length == 2 + [nums.first, nums.last] + else + raise Error, "Invalid --nnodes value: #{value.inspect}" + end + end + + def ensure_fixed_nnodes(min_nodes, max_nodes) + raise Error, "--nnodes minimum must be >= 1" if min_nodes < 1 + raise Error, "--nnodes maximum must be >= minimum" if max_nodes < min_nodes + raise Error, "Elastic nnodes ranges are not supported yet (got #{min_nodes}:#{max_nodes})" if min_nodes != max_nodes + + min_nodes + end + + def world_size + @world_size ||= @num_nodes * @local_world_size + end + + def validate_node_rank! + raise Error, "--node-rank must be >= 0" if @node_rank.negative? 
+ raise Error, "--node-rank (#{@node_rank}) must be less than --nnodes (#{@num_nodes})" if @node_rank >= @num_nodes + end + + def setup_rendezvous! + @rdzv_backend = normalize_backend(@options[:rdzv_backend]) + @rdzv_conf = @options[:rdzv_conf] || {} + if @options[:standalone] + configure_standalone_rendezvous + else + configure_static_rendezvous + end + end + + def normalize_backend(value) + backend = value.to_s.downcase + raise Error, "Unsupported rendezvous backend: #{value.inspect}" unless %w[static c10d].include?(backend) + + backend + end + + def configure_standalone_rendezvous + @standalone = true + @rdzv_backend = "c10d" + @rdzv_id = SecureRandom.uuid + @master_addr = "127.0.0.1" + @master_port = find_free_port(@master_addr) + log(<<~MSG) + + ************************************** + Rendezvous info: + --rdzv-backend=#{@rdzv_backend} + --rdzv-endpoint=#{@master_addr}:#{@master_port} + --rdzv-id=#{@rdzv_id} + ************************************** + + MSG + end + + def configure_static_rendezvous + @standalone = false + endpoint_host, endpoint_port = parse_endpoint(@options[:rdzv_endpoint]) + @master_addr = endpoint_host || @options[:master_addr] + @master_port = endpoint_port || @options[:master_port] + @rdzv_id = @options[:rdzv_id] + raise Error, "MASTER_ADDR must be provided" if @master_addr.to_s.empty? + raise Error, "MASTER_PORT must be > 0" unless @master_port.to_i.positive? + end + + def parse_endpoint(value) + return [nil, nil] if value.nil? || value.strip.empty? + + host, port_str = value.split(":", 2) + port = port_str ? Integer(port_str, exception: false) : nil + raise Error, "Invalid rendezvous endpoint: #{value.inspect}" if host.to_s.empty? || (port_str && port.nil?) 
+ + [host, port] + end + + def find_free_port(host) + server = TCPServer.new(host, 0) + server.addr[1] + ensure + server&.close + end + + def log(message) + @out.puts(message) + end + end + end +end diff --git a/test/distributed_test.rb b/test/distributed_test.rb new file mode 100644 index 00000000..487f3af0 --- /dev/null +++ b/test/distributed_test.rb @@ -0,0 +1,77 @@ +require_relative "test_helper" +require "socket" + +class DistributedTest < Minitest::Test + def setup + super + skip "Distributed backend not available" unless Torch::Distributed.available? + end + + def test_all_reduce + results = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + tensor = Torch.tensor([rank + 1.0]) + Torch::Distributed.all_reduce(tensor) + Torch::Distributed.destroy_process_group + tensor.to_a + end + + assert_equal [[3.0], [3.0]], results + end + + def test_barrier + wait_times = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + sleep 0.3 if rank.zero? + before = Process.clock_gettime(Process::CLOCK_MONOTONIC) + Torch::Distributed.barrier + after = Process.clock_gettime(Process::CLOCK_MONOTONIC) + Torch::Distributed.destroy_process_group + after - before + end + + assert_operator wait_times.first, :<, 0.1 + assert_operator wait_times.last, :>=, 0.25 + end + + def test_broadcast + tensors = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
+ Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + tensor = Torch.tensor([rank + 1.0]) + Torch::Distributed.broadcast(tensor, src: 0) + Torch::Distributed.destroy_process_group + tensor.to_a + end + + assert_equal [[1.0], [1.0]], tensors + end + + def test_ddp_gradient_sync + grads = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + model = Torch::NN::Linear.new(1, 1, bias: false) + ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) + input = Torch.tensor([[rank + 1.0]]) + output = ddp.call(input) + loss = output.sum + loss.backward + + grad = model.parameters.first.grad.item + Torch::Distributed.destroy_process_group + grad + end + + grads.each do |grad| + assert_in_delta 1.5, grad, 1e-6 + end + end + +end diff --git a/test/save_test.rb b/test/save_test.rb index a7438e03..640fdf25 100644 --- a/test/save_test.rb +++ b/test/save_test.rb @@ -55,6 +55,61 @@ def test_load_missing assert_equal "No such file or directory @ rb_sysopen - missing.bin", error.message end + def test_load_with_map_location_string + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + loaded = Torch.load(tmpfile.path, map_location: "cpu") + assert_equal tensor.to_a, loaded.to_a + end + + def test_load_with_map_location_callable + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + seen = [] + loaded = Torch.load(tmpfile.path, map_location: lambda { |value, loc| + seen << loc + value + }) + assert_equal tensor.to_a, loaded.to_a + assert_equal ["cpu"], seen + end + + def test_load_with_weights_only + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + loaded = Torch.load(tmpfile.path, weights_only: true) + assert_equal tensor.to_a, loaded.to_a + end + + 
def test_load_map_location_cuda_to_cpu + skip "Requires CUDA" unless Torch::CUDA.available? + + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]).cuda + Torch.save(tensor, tmpfile.path) + + loaded = Torch.load(tmpfile.path, map_location: "cpu") + assert_equal "cpu", loaded.device.type + assert_equal tensor.cpu.to_a, loaded.to_a + end + + def test_load_map_location_cpu_to_cuda + skip "Requires CUDA" unless Torch::CUDA.available? + + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + + device = "cuda:0" + loaded = Torch.load(tmpfile.path, map_location: device) + assert_equal "cuda", loaded.device.type + assert_equal 0, loaded.device.index + assert_equal tensor.to_a, loaded.cpu.to_a + end + private def assert_save(obj) diff --git a/test/support/scripts/show_ranks.rb b/test/support/scripts/show_ranks.rb new file mode 100644 index 00000000..6654dfcb --- /dev/null +++ b/test/support/scripts/show_ranks.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +$stdout.sync = true +rank = ENV.fetch("RANK", "unknown") +local_rank = ENV.fetch("LOCAL_RANK", "unknown") +world_size = ENV.fetch("WORLD_SIZE", "unknown") +puts "RANK=#{rank} LOCAL_RANK=#{local_rank} WORLD_SIZE=#{world_size}" diff --git a/test/torchrun_test.rb b/test/torchrun_test.rb new file mode 100644 index 00000000..a3cf7a38 --- /dev/null +++ b/test/torchrun_test.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require_relative "test_helper" + +require "open3" +require "rbconfig" + +class TorchRunTest < Minitest::Test + def test_standalone_launches_multiple_workers + script = File.expand_path("support/scripts/show_ranks.rb", __dir__) + torchrun = File.expand_path("../bin/torchrun", __dir__) + stdout, stderr, status = Open3.capture3( + {"TORCHRUN_TEST" => "1"}, + RbConfig.ruby, + torchrun, + "--standalone", + "--nproc-per-node=2", + script + ) + + assert status.success?, "torchrun failed: #{stderr}" + + lines = stdout.lines.map(&:strip).select { |line| 
line.start_with?("RANK=") } + assert_equal 2, lines.size, "expected two worker outputs, got: #{lines.inspect}" + ranks = lines.map do |line| + match = line.match(/RANK=(\d+)\s+LOCAL_RANK=(\d+)\s+WORLD_SIZE=(\d+)/) + raise "unexpected output: #{line}" unless match + + [match[1].to_i, match[2].to_i, match[3].to_i] + end + assert_equal [[0, 0, 2], [1, 1, 2]], ranks.sort + end +end diff --git a/torch-rb.gemspec b/torch-rb.gemspec index 0adcc03b..40c89325 100644 --- a/torch-rb.gemspec +++ b/torch-rb.gemspec @@ -10,7 +10,9 @@ Gem::Specification.new do |spec| spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" - spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*"] + spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*", "bin/*"] + spec.executables = Dir["bin/*"].map { |file| File.basename(file) } + spec.bindir = "bin" spec.require_path = "lib" spec.extensions = ["ext/torch/extconf.rb"] From 20e7845f0ed66eb2212327a38d4f02f9cefbfddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Sat, 15 Nov 2025 10:39:32 +0300 Subject: [PATCH 19/28] Updated distributed example --- examples/mnist/distributed.rb | 79 +++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb index 531b0f87..91b6d52c 100644 --- a/examples/mnist/distributed.rb +++ b/examples/mnist/distributed.rb @@ -5,12 +5,14 @@ require "optparse" require "torch" require "torchvision" -require "socket" +require "tmpdir" unless Torch::Distributed.available? abort "torch.distributed was not built in this binary" end +DEFAULT_CHECKPOINT_PATH = File.join(Dir.tmpdir, "mnist_ddp_checkpoint.pt") + class MyNet < Torch::NN::Module def initialize super() @@ -43,7 +45,9 @@ def parse_options backend: "gloo", gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, log_interval: 20, - data_dir: File.join(__dir__, "data") + data_dir: File.join(__dir__, "data"), + checkpoint_path: DEFAULT_CHECKPOINT_PATH, + resume: false } OptionParser.new do |opts| @@ -56,28 +60,13 @@ def parse_options opts.on("--gpus N", Integer, "Number of GPUs/processes to use") { |v| defaults[:gpus] = v } opts.on("--log-interval N", Integer, "Batches between log statements") { |v| defaults[:log_interval] = v } opts.on("--data-dir PATH", String, "Directory for cached MNIST data") { |v| defaults[:data_dir] = v } + opts.on("--checkpoint PATH", String, "Checkpoint file to save to (default: #{defaults[:checkpoint_path]})") { |v| defaults[:checkpoint_path] = v } + opts.on("--resume", "Load checkpoint weights before training if the file exists") { defaults[:resume] = true } end.parse!(ARGV) defaults end -def free_port - server = TCPServer.new("127.0.0.1", 0) - port = server.addr[1] - server.close - port -end - -def spawn_workers(world_size) - port = free_port - - world_size.times.map do |rank| - fork do - yield(rank, world_size, port) - end - end.each { Process.wait2(_1) } -end - def load_datasets(rank, data_dir) transforms = TorchVision::Transforms::Compose.new([ TorchVision::Transforms::ToTensor.new, @@ -102,6 +91,39 @@ def subset_for_rank(dataset, rank, world_size) Torch::Utils::Data::Subset.new(dataset, indices) end +def checkpoint_map_location(device, rank) + accelerator_device = Torch::Accelerator.current_accelerator + return nil unless accelerator_device + + accelerator_type = accelerator_device.type + target_index = device.index + if target_index.nil? && Torch::Accelerator.respond_to?(:device_count) + count = Torch::Accelerator.device_count + target_index = count.positive? ? 
rank % count : 0 + end + { "#{accelerator_type}:0" => "#{accelerator_type}:#{target_index}" } +end + +def load_checkpoint_if_present(ddp, device, rank, path) + return false unless path && File.exist?(path) + + Torch::Distributed.barrier + kwargs = { weights_only: true } + map_location = checkpoint_map_location(device, rank) + kwargs[:map_location] = map_location if map_location + state_dict = Torch.load(path, **kwargs) + ddp.module.load_state_dict(state_dict) + true +end + +def save_checkpoint(ddp, path, rank) + return unless path + + Torch.save(ddp.module.state_dict, path) if rank.zero? + Torch::Distributed.barrier + puts "Saved checkpoint to #{path}" if rank.zero? +end + def train_epoch(model, device, loader, optimizer, epoch, rank, log_interval) model.train loader.each_with_index do |(data, target), batch_idx| @@ -163,6 +185,18 @@ def run_worker(rank, world_size, port, options) train_subset = subset_for_rank(train_dataset, rank, world_size) train_loader = Torch::Utils::Data::DataLoader.new(train_subset, batch_size: options[:batch_size], shuffle: true) test_loader = Torch::Utils::Data::DataLoader.new(test_dataset, batch_size: options[:batch_size], shuffle: false) if rank.zero? + checkpoint_path = options[:checkpoint_path] + + if options[:resume] + loaded = load_checkpoint_if_present(ddp, device, rank, checkpoint_path) + if rank.zero? + if loaded + puts "Loaded checkpoint weights from #{checkpoint_path}" + else + puts "No checkpoint found at #{checkpoint_path}, starting from random initialization" + end + end + end options[:epochs].times do |epoch_idx| epoch = epoch_idx + 1 @@ -170,6 +204,7 @@ def run_worker(rank, world_size, port, options) if rank.zero? 
evaluate(ddp.module, device, test_loader) end + save_checkpoint(ddp, checkpoint_path, rank) if checkpoint_path end Torch::Distributed.destroy_process_group @@ -190,9 +225,9 @@ def run_worker(rank, world_size, port, options) Torch.manual_seed(1) if world_size == 1 - run_worker(0, 1, free_port, options) + run_worker(0, 1, Torch::Distributed.free_port, options) else - spawn_workers(world_size) do |rank, total, port| - run_worker(rank, total, port, options) + Torch::Distributed.fork_world(world_size) do |rank, port| + run_worker(rank, world_size, port, options) end end From 52282cb32bf0b91b15aed2e941b2d0a809bb8242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 17 Nov 2025 12:25:09 +0300 Subject: [PATCH 20/28] inter-device map_location fixed --- ext/torch/torch.cpp | 30 ++++++++++++++++++++++++++++++ lib/torch.rb | 33 ++++++++++++++++++++++++++++++++- lib/torch/device.rb | 16 ++++++++++++++++ lib/torch/tensor.rb | 3 +-- 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/ext/torch/torch.cpp b/ext/torch/torch.cpp index 20e4c4d4..ded002af 100644 --- a/ext/torch/torch.cpp +++ b/ext/torch/torch.cpp @@ -1,8 +1,14 @@ #include +#include +#include #include #include #include +#include +#include + +#include #include #include @@ -76,6 +82,30 @@ void init_torch(Rice::Module& m) { input.close(); return torch::pickle_load(bytes); }) + .define_singleton_function( + "_load_with_device", + [](const std::string &filename, const std::string &device_str) { + std::ifstream input(filename, std::ios::binary); + std::vector bytes( + (std::istreambuf_iterator(input)), + (std::istreambuf_iterator())); + input.close(); + + auto device = c10::Device(device_str); + auto reader = std::make_shared( + bytes.data(), + static_cast(bytes.size())); + caffe2::serialize::PyTorchStreamReader stream_reader(reader); + + return torch::jit::readArchiveAndTensors( + "data", + /*pickle_prefix=*/"", + 
/*tensor_prefix=*/"", + /*type_resolver=*/std::nullopt, + /*obj_loader=*/std::nullopt, + /*device=*/device, + stream_reader); + }) .define_singleton_function( "_from_blob", [](Rice::String s, const std::vector &size, const torch::TensorOptions &options) { diff --git a/lib/torch.rb b/lib/torch.rb index ce213e1d..dd652872 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -406,7 +406,19 @@ def load(filename, map_location: nil, weights_only: false) # keep backwards compatibility File.open(filename, "rb") { |f| f.read(1) } - result = to_ruby(_load(filename)) + load_device = map_location_device(map_location) if map_location + result = + if load_device + device_str = + if load_device.respond_to?(:_str) + load_device._str + else + load_device.to_s + end + to_ruby(_load_with_device(filename, device_str)) + else + to_ruby(_load(filename)) + end ensure_weights_only_contents!(result) if weights_only result = apply_map_location(result, map_location) if map_location result @@ -570,6 +582,25 @@ def ensure_weights_only_contents!(obj) end end + def map_location_device(map_location) + case map_location + when Device, String, Symbol + normalize_map_location_device(map_location) + when Hash + devices = + map_location.values.map do |value| + normalize_map_location_device(value) + rescue StandardError + nil + end.compact + return nil if devices.empty? + devices.uniq! + devices.one? ? 
devices.first : nil + else + nil + end + end + def apply_map_location(obj, map_location) case obj when Tensor diff --git a/lib/torch/device.rb b/lib/torch/device.rb index 45a822a8..f80868ff 100644 --- a/lib/torch/device.rb +++ b/lib/torch/device.rb @@ -22,4 +22,20 @@ def hash [type, index].hash end end + + # String-like wrapper that also exposes device metadata + class DeviceString < String + def initialize(device) + @device = device + super(device._str) + end + + def type + @device.type + end + + def index + @device.index + end + end end diff --git a/lib/torch/tensor.rb b/lib/torch/tensor.rb index ed8ab71e..318f14e0 100644 --- a/lib/torch/tensor.rb +++ b/lib/torch/tensor.rb @@ -211,9 +211,8 @@ def coerce(other) end end - # TODO return Device instead of String in 0.19.0 def device - _device._str + DeviceString.new(_device) end end end From cac2534a771f9663ddea9ad06fcc8a97d1e27f11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 17 Nov 2025 13:02:57 +0300 Subject: [PATCH 21/28] autodetecting libtorch distributed support --- ext/torch/extconf.rb | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index cf3c6706..0a8250b0 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -70,6 +70,45 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end +supports_c10d = try_link(<<~CPP, "-DUSE_C10D") + #include + #include + + int main() { + ::c10d::FileStore store("unused", 1); + return 0; + } +CPP + +supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") + #include + #include + #include + + int main() { + auto store = c10::make_intrusive<::c10d::FileStore>("unused", 1); + auto opts = ::c10d::ProcessGroupGloo::Options::create(); + opts->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); + ::c10d::ProcessGroupGloo pg(store, 0, 1, opts); + return 
static_cast(pg.getRank()); + } +CPP + +supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") + #include + #include + + int main() { + auto opts = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + opts->is_high_priority_stream = false; + return 0; + } +CPP + +$defs << "-DUSE_C10D" if supports_c10d +$defs << "-DUSE_C10D_GLOO" if supports_c10d_gloo +$defs << "-DUSE_C10D_NCCL" if supports_c10d_nccl + # generate C++ functions puts "Generating C++ functions..." require_relative "../../codegen/generate_functions" From d8dc2953e4cbf995d1f2811ad56eebbc4649f113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 13:21:03 +0300 Subject: [PATCH 22/28] DDP fixes and improvements --- README.md | 11 + examples/benchmark/training.rb | 207 +++++++++++ examples/mnist/distributed.rb | 11 +- ext/torch/cuda.cpp | 9 +- ext/torch/distributed.cpp | 98 ++++- ext/torch/extconf.rb | 22 +- ext/torch/tensor.cpp | 14 +- lib/torch/distributed.rb | 338 +++++++++++++++--- .../nn/parallel/distributed_data_parallel.rb | 20 +- lib/torch/tensor.rb | 3 +- lib/torch/torchrun.rb | 26 +- test/distributed_test.rb | 193 ++++++++-- test/test_helper.rb | 26 ++ 13 files changed, 880 insertions(+), 98 deletions(-) create mode 100644 examples/benchmark/training.rb diff --git a/README.md b/README.md index d477c8ca..78934193 100644 --- a/README.md +++ b/README.md @@ -56,9 +56,18 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) - [Distributed MNIST training](examples/mnist/distributed.rb) +- [Training benchmarks (variable batch size / GPU count)](examples/benchmark/training.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) +Run the benchmark with: + +```sh +bundle 
exec ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 256 --gpus 1 --steps 50 +``` + +Set `--gpus` to 2+ to enable distributed training; `--steps` measures only timed steps and `--warmup` sets warmup iterations. + ## Distributed Training Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. @@ -84,6 +93,8 @@ bundle exec torchrun \ On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. +For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: diff --git a/examples/benchmark/training.rb b/examples/benchmark/training.rb new file mode 100644 index 00000000..090018f6 --- /dev/null +++ b/examples/benchmark/training.rb @@ -0,0 +1,207 @@ +# Benchmark training throughput for common architectures/datasets. +# Usage examples: +# ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 128 --gpus 1 +# ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 128 --gpus 2 --steps 50 + +require "bundler/setup" +require "optparse" +require "torch" +require "torchvision" + +DEFAULT_BACKEND = if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ "nccl" +else + Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" +end + +class MnistCnn < Torch::NN::Module + def initialize + super() + @conv1 = Torch::NN::Conv2d.new(1, 32, 3, stride: 1) + @conv2 = Torch::NN::Conv2d.new(32, 64, 3, stride: 1) + @dropout1 = Torch::NN::Dropout2d.new(p: 0.25) + @dropout2 = Torch::NN::Dropout2d.new(p: 0.5) + @fc1 = Torch::NN::Linear.new(9216, 128) + @fc2 = Torch::NN::Linear.new(128, 10) + end + + def forward(x) + x = Torch::NN::F.relu(@conv1.call(x)) + x = Torch::NN::F.relu(@conv2.call(x)) + x = Torch::NN::F.max_pool2d(x, 2) + x = @dropout1.call(x) + x = Torch.flatten(x, start_dim: 1) + x = Torch::NN::F.relu(@fc1.call(x)) + x = @dropout2.call(x) + Torch::NN::F.log_softmax(@fc2.call(x), 1) + end +end + +ARCH_CONFIGS = { + "mnist_cnn" => { + model: -> { MnistCnn.new }, + dataset: :mnist + } +}.freeze + +def parse_options + defaults = { + arch: "mnist_cnn", + batch_size: 128, + steps: 100, + warmup: 10, + backend: DEFAULT_BACKEND, + gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, + data_dir: File.join(__dir__, "data"), + lr: 0.01 + } + + OptionParser.new do |opts| + opts.banner = "Usage: ruby examples/benchmark/training.rb [options]" + opts.on("--arch NAME", "Architecture to benchmark (#{ARCH_CONFIGS.keys.join(', ')}, default: #{defaults[:arch]})") { |v| defaults[:arch] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--steps N", Integer, "Number of timed training steps (default: #{defaults[:steps]})") { |v| defaults[:steps] = v } + opts.on("--warmup N", Integer, "Number of warmup steps not included in timing (default: #{defaults[:warmup]})") { |v| defaults[:warmup] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--gpus N", Integer, "Number of GPUs/processes to use (1 for non-distributed)") { |v| defaults[:gpus] = v } + opts.on("--data-dir PATH", String, "Directory for cached datasets (default: #{defaults[:data_dir]})") { |v| defaults[:data_dir] = v } + opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } + end.parse!(ARGV) + + defaults +end + +def dataset_for(name, data_dir, distributed:, rank:, world_size:) + case name + when :mnist + transforms = TorchVision::Transforms::Compose.new([ + TorchVision::Transforms::ToTensor.new, + TorchVision::Transforms::Normalize.new([0.1307], [0.3081]) + ]) + + if distributed + if rank.zero? 
+ train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + Torch::Distributed.barrier + else + Torch::Distributed.barrier + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: false, transform: transforms) + end + indices = rank.step(train.size - 1, world_size).to_a + Torch::Utils::Data::Subset.new(train, indices) + else + TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + end + else + raise ArgumentError, "Unknown dataset: #{name}" + end +end + +def sync_cuda_if_needed(device) + return unless device && device.type == "cuda" + return unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:synchronize) + + Torch::CUDA.synchronize +end + +def benchmark_worker(rank, world_size, port, options) + arch = options.fetch(:arch) + config = ARCH_CONFIGS[arch] + raise ArgumentError, "Unsupported architecture #{arch.inspect}" unless config + + distributed = world_size > 1 + if distributed + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + accelerator = Torch::Accelerator.current_accelerator + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + end + + device = if Torch::CUDA.available? && options[:gpus] > 0 + Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + else + Torch.device("cpu") + end + + model = config[:model].call.to(device) + if distributed + ddp_devices = device.type == "cuda" ? 
[device.index] : nil + model = Torch::NN::Parallel::DistributedDataParallel.new(model, device_ids: ddp_devices) + end + optimizer = Torch::Optim::SGD.new(model.parameters, lr: options[:lr]) + + loader = Torch::Utils::Data::DataLoader.new( + dataset_for(config[:dataset], options[:data_dir], distributed: distributed, rank: rank, world_size: world_size), + batch_size: options[:batch_size], + shuffle: true + ) + + warmup_steps = options[:warmup] + timed_steps = options[:steps] + total_steps = warmup_steps + timed_steps + + step_idx = 0 + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + step_idx += 1 + break if step_idx >= total_steps + end + + sync_cuda_if_needed(device) + Torch::Distributed.barrier if distributed + + timed = 0 + step_idx = 0 + start = Process.clock_gettime(Process::CLOCK_MONOTONIC) + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + step_idx += 1 + break if step_idx >= timed_steps + end + + sync_cuda_if_needed(device) + Torch::Distributed.barrier if distributed + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start + timed = step_idx + + if rank.zero? 
+ images = timed * options[:batch_size] * world_size + puts "Architecture: #{arch}" + puts "Dataset: #{config[:dataset]}" + puts "GPUs: #{world_size}" + puts "Batch size per process: #{options[:batch_size]}" + puts "Timed steps: #{timed}" + puts "Total images: #{images}" + puts format("Elapsed: %.3fs | Throughput: %.1f images/s", elapsed, images / elapsed) + end + + Torch::Distributed.destroy_process_group if distributed +end + +options = parse_options +world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if world_size < 1 +Torch.manual_seed(1) + +if world_size > 1 + raise "torch.distributed is not available" unless Torch::Distributed.available? + Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| + benchmark_worker(rank, world_size, port, options) + end +else + benchmark_worker(0, 1, Torch::Distributed.free_port, options) +end diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb index 91b6d52c..d300ead0 100644 --- a/examples/mnist/distributed.rb +++ b/examples/mnist/distributed.rb @@ -12,6 +12,11 @@ end DEFAULT_CHECKPOINT_PATH = File.join(Dir.tmpdir, "mnist_ddp_checkpoint.pt") +DEFAULT_BACKEND = if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + "nccl" +else + Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" +end class MyNet < Torch::NN::Module def initialize @@ -42,7 +47,7 @@ def parse_options batch_size: 64, lr: 1.0, gamma: 0.7, - backend: "gloo", + backend: DEFAULT_BACKEND, gpus: Torch::CUDA.available? ? [Torch::CUDA.device_count, 1].max : 1, log_interval: 20, data_dir: File.join(__dir__, "data"), @@ -167,7 +172,7 @@ def evaluate(model, device, loader) def run_worker(rank, world_size, port, options) store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) 
accelerator = Torch::Accelerator.current_accelerator - backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) device = if Torch::CUDA.available? && options[:gpus] > 0 @@ -227,7 +232,7 @@ def run_worker(rank, world_size, port, options) if world_size == 1 run_worker(0, 1, Torch::Distributed.free_port, options) else - Torch::Distributed.fork_world(world_size) do |rank, port| + Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| run_worker(rank, world_size, port, options) end end diff --git a/ext/torch/cuda.cpp b/ext/torch/cuda.cpp index 23f38d80..69b2529f 100644 --- a/ext/torch/cuda.cpp +++ b/ext/torch/cuda.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -9,5 +10,11 @@ void init_cuda(Rice::Module& m) { .define_singleton_function("available?", &torch::cuda::is_available) .define_singleton_function("device_count", &torch::cuda::device_count) .define_singleton_function("manual_seed", &torch::cuda::manual_seed) - .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all); + .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all) + .define_singleton_function( + "set_device", + [](int device_id) { + c10::cuda::set_device(device_id); + return Rice::Nil; + }); } diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp index de5f7c9a..b3a22bc3 100644 --- a/ext/torch/distributed.cpp +++ b/ext/torch/distributed.cpp @@ -1,11 +1,16 @@ #include #include #include +#include #include #include #include #include +#if defined(USE_C10D) && defined(USE_C10D_NCCL) +#include +#include +#endif #include #include @@ -13,6 +18,7 @@ #include "utils.h" #ifdef USE_C10D +#include #include #include #include @@ -34,7 +40,7 @@ namespace { #ifdef USE_C10D using StorePtr = 
c10::intrusive_ptr<::c10d::Store>; -using ProcessGroupPtr = c10::intrusive_ptr<::c10d::ProcessGroup>; +using ProcessGroupPtr = c10::intrusive_ptr<::c10d::Backend>; struct StoreWrapper { StoreWrapper() = default; @@ -51,6 +57,24 @@ struct ProcessGroupWrapper { }; ProcessGroupPtr default_process_group; +std::once_flag default_pg_cleanup_once; + +void shutdown_default_process_group() { + if (default_process_group) { + try { + default_process_group->shutdown(); + } catch (...) { + // best effort; ensure reset still happens + } + default_process_group.reset(); + } +} + +void register_default_pg_cleanup() { + std::call_once(default_pg_cleanup_once, []() { + std::atexit([]() { shutdown_default_process_group(); }); + }); +} ProcessGroupPtr resolve_process_group(Rice::Object pg_obj) { if (pg_obj.is_nil()) { @@ -80,6 +104,7 @@ int reduce_op_from_int(int code) { void init_distributed(Rice::Module& m) { auto rb_mDistributed = Rice::define_module_under(m, "Distributed"); #ifdef USE_C10D + register_default_pg_cleanup(); rb_mDistributed.define_singleton_function("available?", []() { return true; }); auto rb_cStore = Rice::define_class_under(rb_mDistributed, "Store"); @@ -116,7 +141,7 @@ void init_distributed(Rice::Module& m) { int world_size, bool is_master, int64_t timeout_millis, - bool wait_for_workers) { + bool wait_for_workers) -> Rice::Object { ::c10d::TCPStoreOptions opts; opts.port = static_cast(port); opts.isServer = is_master; @@ -124,22 +149,23 @@ void init_distributed(Rice::Module& m) { opts.waitWorkers = wait_for_workers; opts.timeout = std::chrono::milliseconds(timeout_millis); auto store = c10::make_intrusive<::c10d::TCPStore>(host, opts); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + // Pass ownership first, then the Ruby class so Rice doesn't treat the class as the owner flag + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); rb_mDistributed.define_singleton_function( "_create_file_store", - [rb_cStore](const 
std::string& path, int world_size) { + [rb_cStore](const std::string& path, int world_size) -> Rice::Object { auto store = c10::make_intrusive<::c10d::FileStore>(path, world_size); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); #if !defined(_WIN32) rb_mDistributed.define_singleton_function( "_create_hash_store", - [rb_cStore]() { + [rb_cStore]() -> Rice::Object { auto store = c10::make_intrusive<::c10d::HashStore>(); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); #endif @@ -149,7 +175,8 @@ void init_distributed(Rice::Module& m) { StoreWrapper& store_wrapper, int rank, int world_size, - int64_t timeout_millis) { + int64_t timeout_millis, + int device_id) -> Rice::Object { StorePtr store = store_wrapper.store_; if (!store) { rb_raise(rb_eArgError, "Store is required for init_process_group"); @@ -179,14 +206,32 @@ void init_distributed(Rice::Module& m) { rb_raise(rb_eArgError, "Unsupported backend: %s", backend.c_str()); } + if (device_id >= 0 && backend_lower == "nccl") { +#if defined(USE_C10D_NCCL) + if (!torch::cuda::is_available()) { + rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend"); + } + auto device_count = torch::cuda::device_count(); + if (device_id >= static_cast(device_count)) { + rb_raise( + rb_eArgError, + "Invalid device_id %d for NCCL backend (available devices: %d)", + device_id, + static_cast(device_count)); + } + c10::cuda::set_device(device_id); + pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id)); +#endif + } + default_process_group = pg; - return Rice::Data_Object(new ProcessGroupWrapper(pg), rb_cProcessGroup, true); + return Rice::Data_Object(new ProcessGroupWrapper(pg), true, rb_cProcessGroup); }); rb_mDistributed.define_singleton_function( "_destroy_process_group", []() { - default_process_group.reset(); + 
shutdown_default_process_group(); return Rice::Nil; }); @@ -198,11 +243,11 @@ void init_distributed(Rice::Module& m) { rb_mDistributed.define_singleton_function( "_default_process_group", - [rb_cProcessGroup]() { + [rb_cProcessGroup]() -> Rice::Object { if (!default_process_group) { return Rice::Nil; } - return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), rb_cProcessGroup, true); + return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), true, rb_cProcessGroup); }); rb_mDistributed.define_singleton_function( @@ -253,6 +298,33 @@ void init_distributed(Rice::Module& m) { return tensor; }); + rb_mDistributed.define_singleton_function( + "_register_ddp_hook", + [](torch::Tensor& tensor, ProcessGroupWrapper& pg_wrapper, int world_size) -> unsigned { + if (!pg_wrapper.pg_) { + rb_raise(rb_eArgError, "Process group is required for DDP hook registration"); + } + if (world_size <= 0) { + rb_raise(rb_eArgError, "world_size must be positive"); + } + + auto pg = pg_wrapper.pg_; + // Register a native autograd hook that all-reduces gradients and scales + // them by the world size. This avoids calling back into Ruby from + // autograd worker threads. 
+ unsigned handle = tensor.register_hook([pg, world_size](const at::Tensor& grad) { + ::c10d::AllreduceOptions opts; + opts.reduceOp = ::c10d::ReduceOp::SUM; + std::vector tensors{grad}; + auto work = pg->allreduce(tensors, opts); + work->wait(); + grad.div_(static_cast(world_size)); + return grad; + }); + + return handle; + }); + auto rb_mReduceOp = Rice::define_module_under(rb_mDistributed, "ReduceOp"); rb_mReduceOp.const_set("SUM", INT2NUM(static_cast(::c10d::ReduceOp::SUM))); rb_mReduceOp.const_set("AVG", INT2NUM(static_cast(::c10d::ReduceOp::AVG))); @@ -264,7 +336,7 @@ void init_distributed(Rice::Module& m) { rb_mReduceOp.const_set("BXOR", INT2NUM(static_cast(::c10d::ReduceOp::BXOR))); rb_mReduceOp.const_set("PREMUL_SUM", INT2NUM(static_cast(::c10d::ReduceOp::PREMUL_SUM))); - rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::c10d::kProcessGroupDefaultTimeout.count() / 1000)); + rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::kProcessGroupDefaultTimeout.count() / 1000)); #else rb_mDistributed.define_singleton_function("available?", []() { return false; }); #endif diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 0a8250b0..2cadd1c9 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -47,6 +47,7 @@ with_cuda = false if Dir["#{lib}/*torch_cuda*"].any? 
$LDFLAGS += " -L#{cuda_lib}" if Dir.exist?(cuda_lib) + $INCFLAGS += " -I#{cuda_inc}" if Dir.exist?(cuda_inc) $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib with_cuda = have_library("cuda") && have_library("cudnn") end @@ -70,6 +71,9 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end +CONFIG["CC"] = CONFIG["CXX"] +$CFLAGS = $CXXFLAGS + supports_c10d = try_link(<<~CPP, "-DUSE_C10D") #include #include @@ -105,9 +109,21 @@ } CPP -$defs << "-DUSE_C10D" if supports_c10d -$defs << "-DUSE_C10D_GLOO" if supports_c10d_gloo -$defs << "-DUSE_C10D_NCCL" if supports_c10d_nccl +if supports_c10d + $defs << " -DUSE_C10D" + puts "Building with distributed support" +else + puts "Building without distributed support" +end + +if supports_c10d_gloo + $defs << "-DUSE_C10D_GLOO" + puts "GLOO support detected" +end +if supports_c10d_nccl + $defs << "-DUSE_C10D_NCCL" + puts "NCCL support detected" +end # generate C++ functions puts "Generating C++ functions..." diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index c7e003d8..d5fb0bc3 100644 --- a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "tensor_functions.h" @@ -38,11 +39,22 @@ struct RubyTensorHook { rb_gc_register_address(&proc_); } + // The autograd engine can invoke hooks from threads not created by Ruby. + // Register the calling thread with Ruby before acquiring the GVL to avoid + // "rb_thread_call_with_gvl() is called by non-ruby thread" crashes. + static void ensure_ruby_thread_registered() { + // ruby_init_stack is idempotent and safe to call repeatedly; it ensures the + // current native thread is known to the VM before we try to grab the GVL. 
+ volatile VALUE stack_anchor = Qnil; + ruby_init_stack(&stack_anchor); + } + ~RubyTensorHook() { rb_gc_unregister_address(&proc_); } at::Tensor call(const at::Tensor& grad) { + ensure_ruby_thread_registered(); HookCallData data{proc_, grad}; rb_thread_call_with_gvl(&RubyTensorHook::invoke, &data); if (data.return_value_defined) { @@ -121,7 +133,7 @@ VALUE tensor_register_hook(int argc, VALUE* argv, VALUE self_) { return hook->call(grad); }); - return Rice::Data_Object(new HookHandle(self, handle, hook), rb_cHookHandle, true); + return Rice::Data_Object(new HookHandle(self, handle, hook), true, rb_cHookHandle); END_HANDLE_TH_ERRORS } diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb index 77428b52..7e1e739d 100644 --- a/lib/torch/distributed.rb +++ b/lib/torch/distributed.rb @@ -1,4 +1,5 @@ require "socket" +require "rbconfig" module Torch module Distributed @@ -9,6 +10,15 @@ module Distributed "mps" => "gloo" }.freeze + SPAWN_ENV_KEY = "TORCH_DISTRIBUTED_SPAWNED".freeze + SPAWN_RANK_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_RANK".freeze + SPAWN_WORLD_SIZE_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_WORLD_SIZE".freeze + SPAWN_PORT_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_PORT".freeze + SPAWN_PIPE_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_PIPE".freeze + SPAWN_SCRIPT_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_SCRIPT".freeze + SPAWN_TEST_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_TEST".freeze + SPAWN_ARGV = ARGV.dup.freeze + class << self def initialized? _initialized? @@ -38,8 +48,15 @@ def init_process_group(backend = nil, init_method: "env://", store: nil, rank: n raise ArgumentError, "rank is required" if rank.nil? raise ArgumentError, "world_size is required" if world_size.nil? + device_id ||= default_device_id_for_backend(backend, rank, world_size) + timeout_ms = (timeout * 1000).to_i - _init_process_group(backend, store, rank, world_size, timeout_ms) + bound_device_id = device_id.nil? ? 
-1 : Integer(device_id) + if backend == "nccl" && bound_device_id >= 0 && Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device) + Torch::CUDA.set_device(bound_device_id) + end + pg = _init_process_group(backend, store, rank, world_size, timeout_ms, bound_device_id) + warmup_process_group(pg, backend) end def destroy_process_group @@ -75,64 +92,101 @@ def broadcast(tensor, src:, group: nil) _broadcast(tensor, src, group) end + def register_ddp_hook(tensor, process_group, world_size) + ensure_process_group!(process_group) + _register_ddp_hook(tensor, process_group, Integer(world_size)) + rescue NoMethodError + # Fallback for environments built without the native helper; this may + # still call back into Ruby from autograd threads. + tensor.register_hook do |grad| + all_reduce(grad, group: process_group) + grad.div!(world_size.to_f) + end + end + def get_default_backend_for_device(device) backend = DEFAULT_DEVICE_BACKENDS[device_type_from(device)] raise ArgumentError, "Default backend not registered for device: #{device.inspect}" unless backend backend end - def fork_world(world_size, host: "127.0.0.1") + def fork_world(world_size, host: "127.0.0.1", start_method: :fork, &block) raise ArgumentError, "world_size must be positive" unless world_size.to_i.positive? - raise ArgumentError, "block required" unless block_given? + raise ArgumentError, "block required" unless block + start_method = normalize_start_method(start_method) + return run_spawn_worker(&block) if start_method == :spawn && spawn_worker? 
+ + fork_spawn_world(world_size, host: host, start_method: start_method, &block) + end + + def fork_spawn_world(world_size, host:, start_method:, &block) port = free_port(host: host) readers = [] pids = [] - world_size.times do |rank| - reader, writer = IO.pipe - pid = fork do - reader.close + pgid = nil + completed = false + + begin + world_size.times do |rank| + reader, writer = IO.pipe begin - writer.binmode - result = yield(rank, port) - Marshal.dump(result, writer) - exit! 0 - rescue => e - Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) - exit! 1 - ensure + case start_method + when :fork + pids << fork_worker(reader, writer, rank, port, world_size, &block) + when :spawn + pid, pgid = spawn_worker(reader, writer, rank, port, host: host, world_size: world_size, pgid: pgid) + pids << pid + else + raise ArgumentError, "Unsupported start_method: #{start_method.inspect}" + end + readers << reader writer.close unless writer.closed? + rescue Exception + reader.close unless reader.closed? + writer.close unless writer.closed? + raise end end - writer.close - readers << reader - pids << pid - end - outputs = readers.map do |reader| - data = Marshal.load(reader) - reader.close - data - end + read_failure = Object.new - statuses = pids.each_with_index.map do |pid, idx| - _pid, status = Process.wait2(pid) - [idx, pid, status] - end + outputs = readers.map do |reader| + begin + Marshal.load(reader) + rescue EOFError + read_failure + ensure + reader.close unless reader.closed? + end + end + + statuses = pids.each_with_index.map do |pid, idx| + _pid, status = Process.wait2(pid) + [idx, pid, status] + end - statuses.each do |idx, pid, status| - output = outputs[idx] - if !status.success? 
|| (output.is_a?(Hash) && output[:error]) - message = if output.is_a?(Hash) && output[:error] - "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" - else - "Child #{pid} exited with status #{status.exitstatus}" + statuses.each do |idx, pid, status| + output = outputs[idx] + if output.equal?(read_failure) + raise Torch::Error, "Child #{pid} closed pipe before sending result (status #{status.exitstatus})" + end + if !status.success? || (output.is_a?(Hash) && output[:error]) + message = if output.is_a?(Hash) && output[:error] + "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" + else + "Child #{pid} exited with status #{status.exitstatus}" + end + raise Torch::Error, message end - raise Torch::Error, message end - end - outputs + completed = true + outputs + ensure + # Ensure child workers are cleaned up if an interrupt or error occurs. + terminate_processes(pids, pgid: pgid) unless completed + end end def free_port(host: "127.0.0.1") @@ -150,6 +204,44 @@ def ensure_process_group!(group) raise Torch::Error, "Default process group is not initialized" end + def default_device_id_for_backend(backend, rank, world_size) + return unless backend == "nccl" + + default_local_rank(rank, world_size) + end + + def warmup_process_group(pg, backend) + return pg unless backend == "nccl" + + # Only warm up when a native process group was returned. + # Test helpers may stub out `_init_process_group` and return arbitrary + # Ruby objects, which cannot be passed to the C++ bindings. + return pg unless pg.nil? || (defined?(Torch::Distributed::ProcessGroup) && pg.is_a?(Torch::Distributed::ProcessGroup)) + + # Prime NCCL communicators so the first user-visible collective is fast + _barrier(pg) + pg + rescue + _destroy_process_group + raise + end + + def default_local_rank(rank, world_size) + local_rank = env_integer("LOCAL_RANK") + return local_rank unless local_rank.nil? 
+ + local_world_size = env_integer("LOCAL_WORLD_SIZE") || world_size + return unless local_world_size && rank + + rank % local_world_size if local_world_size.positive? + end + + def env_integer(key) + Integer(ENV[key]) if ENV.key?(key) + rescue ArgumentError + nil + end + def default_backend_for(device_id) get_default_backend_for_device(device_id) end @@ -158,18 +250,174 @@ def device_type_from(device) case device when Torch::Device device.type + when NilClass + accelerator_type || "cpu" when String Torch.device(device).type when Integer - Torch.device("cuda:#{device}").type - when NilClass - Torch::Accelerator.current_accelerator&.type || "cpu" + return accelerator_type || "cpu" if device.negative? + if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:device_count) + max = Torch::CUDA.device_count + return accelerator_type || "cpu" if max <= 0 || device >= max + return Torch.device("cuda:#{device}").type + end + accelerator_type || "cpu" else + return device.type if device.respond_to?(:type) Torch.device(device).type end rescue => e raise ArgumentError, "Invalid device #{device.inspect}: #{e.message}" end + + def accelerator_type + acc = Torch::Accelerator.current_accelerator + acc.type if acc && acc.respond_to?(:type) + rescue + nil + end + + def normalize_start_method(start_method) + method = start_method&.to_sym + return method if [:fork, :spawn].include?(method) + + raise ArgumentError, "start_method must be :fork or :spawn (got #{start_method.inspect})" + end + + def spawn_worker? 
+ ENV[SPAWN_ENV_KEY] == "1" + end + + def run_spawn_worker(&block) + rank = Integer(ENV.fetch(SPAWN_RANK_ENV_KEY)) + port = Integer(ENV.fetch(SPAWN_PORT_ENV_KEY)) + pipe_fd = Integer(ENV.fetch(SPAWN_PIPE_ENV_KEY)) + + writer = IO.new(pipe_fd, "wb") + writer.binmode + writer.sync = true + + result = block.call(rank, port) + Marshal.dump(result, writer) + writer.flush + writer.close + Process.exit!(0) + rescue Exception => e + begin + if defined?(writer) && writer && !writer.closed? + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + writer.flush + writer.close + end + rescue StandardError + # best-effort error reporting back to parent + ensure + Process.exit!(1) + end + end + + def fork_worker(reader, writer, rank, port, world_size, &block) + fork do + reader.close + begin + ENV["LOCAL_RANK"] = rank.to_s + ENV["LOCAL_WORLD_SIZE"] = world_size.to_s + ENV["RANK"] = rank.to_s + ENV["WORLD_SIZE"] = world_size.to_s + writer.binmode + writer.sync = true + result = block.call(rank, port) + Marshal.dump(result, writer) + writer.flush + writer.close + Process.exit!(0) + rescue => e + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + writer.flush + writer.close + Process.exit!(1) + ensure + writer.close unless writer.closed? 
+ end + end + end + + def spawn_worker(reader, writer, rank, port, host:, world_size:, pgid: nil) + writer.binmode + writer.close_on_exec = false + + script = ENV[SPAWN_SCRIPT_ENV_KEY] || $0 + env = { + SPAWN_ENV_KEY => "1", + SPAWN_RANK_ENV_KEY => rank.to_s, + SPAWN_WORLD_SIZE_ENV_KEY => world_size.to_s, + SPAWN_PORT_ENV_KEY => port.to_s, + SPAWN_PIPE_ENV_KEY => writer.fileno.to_s, + "LOCAL_RANK" => rank.to_s, + "LOCAL_WORLD_SIZE" => world_size.to_s, + "MASTER_ADDR" => host, + "MASTER_PORT" => port.to_s, + "RANK" => rank.to_s, + "WORLD_SIZE" => world_size.to_s + } + env["RUBYLIB"] = [ENV["RUBYLIB"], $LOAD_PATH.join(File::PATH_SEPARATOR)].compact.reject(&:empty?).join(File::PATH_SEPARATOR) + + spawn_opts = {close_others: false} + spawn_opts[:pgroup] = pgid ? pgid : true + + pid = Process.spawn(env, RbConfig.ruby, script, *spawn_argv, spawn_opts) + pgid ||= pid + [pid, pgid] + rescue SystemCallError => e + raise Torch::Error, "failed to spawn worker #{rank}: #{e.message}" + end + + def spawn_argv + test_filter = ENV[SPAWN_TEST_ENV_KEY] + return SPAWN_ARGV unless test_filter + return SPAWN_ARGV if SPAWN_ARGV.include?("-n") + + # Restrict child to the specific test that triggered the spawn + SPAWN_ARGV + ["-n", test_filter] + end + + def terminate_processes(pids, pgid: nil) + return if pids.empty? 
&& !pgid + + send_process_group_signal(pgid, "TERM") + pids.each { |pid| safe_kill(pid, "TERM") } + sleep(0.2) + pids.each do |pid| + next unless process_alive?(pid) + + safe_kill(pid, "KILL") + end + pids.each do |pid| + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def send_process_group_signal(pgid, sig) + return unless pgid + + Process.kill(sig, -pgid) + rescue Errno::ESRCH + end + + def safe_kill(pid, sig) + Process.kill(sig, pid) + rescue Errno::ESRCH + end + + def process_alive?(pid) + Process.kill(0, pid) + true + rescue Errno::ESRCH + false + end end class TCPStore @@ -193,3 +441,11 @@ def self.new end end end + +at_exit do + begin + Torch::Distributed.destroy_process_group if Torch::Distributed.available? && Torch::Distributed.initialized? + rescue Exception + # best-effort cleanup to avoid leaked process groups + end +end diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb index 87178f3b..2a1782c8 100644 --- a/lib/torch/nn/parallel/distributed_data_parallel.rb +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -15,7 +15,7 @@ def initialize(mod, device_ids: nil, process_group: nil, broadcast_buffers: true @world_size = Torch::Distributed.get_world_size(@process_group) @rank = Torch::Distributed.get_rank(@process_group) - @device = Array(device_ids).compact.first + @device = normalize_device(Array(device_ids).compact.first) move_to_device(@device) if @device synchronize_parameters @@ -38,6 +38,19 @@ def train(mode = true) private + def normalize_device(device) + return nil unless device + return device if device.is_a?(Torch::Device) + + if device.is_a?(Integer) + if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ return Torch.device("cuda:#{device}") + end + end + + Torch.device(device) + end + def move_to_device(device) return unless device @@ -89,10 +102,7 @@ def register_parameter_hooks @module.parameters.filter_map do |param| next unless param.requires_grad? - param.register_hook do |grad| - Torch::Distributed.all_reduce(grad, group: @process_group) - grad.div!(@world_size.to_f) - end + Torch::Distributed.register_ddp_hook(param, @process_group, @world_size) end end end diff --git a/lib/torch/tensor.rb b/lib/torch/tensor.rb index 318f14e0..cfc4b63c 100644 --- a/lib/torch/tensor.rb +++ b/lib/torch/tensor.rb @@ -115,7 +115,8 @@ def item if numel != 1 raise Error, "only one element tensors can be converted to Ruby scalars" end - to_a.first + # use flatten to handle tensors with a single element but multiple dimensions + to_a.flatten.first end def to_i diff --git a/lib/torch/torchrun.rb b/lib/torch/torchrun.rb index b5913334..9155b892 100644 --- a/lib/torch/torchrun.rb +++ b/lib/torch/torchrun.rb @@ -203,20 +203,28 @@ def launch_worker_group(restart_count) status ensure + @worker_pgid = nil @current_pids = [] end def spawn_workers(restart_count) base_env = base_environment(restart_count) - Array.new(@local_world_size) do |local_rank| + pgid = nil + workers = Array.new(@local_world_size) do |local_rank| env = base_env.merge(rank_environment(local_rank)) - spawn_worker(env, local_rank) + pid, pgid = spawn_worker(env, local_rank, pgid) + pid end + @worker_pgid = pgid + workers end - def spawn_worker(env, local_rank) + def spawn_worker(env, local_rank, pgid) args = command_arguments(local_rank) - Process.spawn(env, *args) + spawn_opts = pgid ? { pgroup: pgid } : { pgroup: true } + pid = Process.spawn(env, *args, spawn_opts) + pgid ||= pid + [pid, pgid] rescue SystemCallError => e raise Error, "failed to launch worker #{local_rank}: #{e.message}" end @@ -287,6 +295,7 @@ def monitor_workers(pids) def terminate_workers(pids) return if pids.empty? 
+ send_process_group_signal("TERM") pids.each { |pid| send_signal(pid, "TERM") } sleep(0.2) pids.each do |pid| @@ -322,6 +331,7 @@ def setup_signal_handlers end def forward_signal(sig) + send_process_group_signal(sig) (@current_pids || []).each { |pid| send_signal(pid, sig) } end @@ -339,6 +349,14 @@ def send_signal(pid, sig) nil end + def send_process_group_signal(sig) + return unless @worker_pgid + + Process.kill(sig, -@worker_pgid) + rescue Errno::ESRCH + nil + end + def cleanup_workers(pids) pids.each do |pid| next unless process_alive?(pid) diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 487f3af0..659a4930 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -1,20 +1,149 @@ require_relative "test_helper" +require "torch/distributed" require "socket" -class DistributedTest < Minitest::Test +class DistributedInitProcessGroupTest < Minitest::Test + def setup + skip "Distributed backend not available" unless Torch::Distributed.available? + end + + def test_defaults_nccl_device_id_from_local_rank_env + calls = [] + with_stubbed_init_process_group(calls) do + ENV["LOCAL_RANK"] = "2" + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 5, world_size: 8) + ensure + ENV.delete("LOCAL_RANK") + end + + assert_equal 1, calls.size + assert_equal 2, calls.first[:device_id] + end + + def test_falls_back_to_local_world_size_modulo + calls = [] + with_stubbed_init_process_group(calls) do + ENV["LOCAL_WORLD_SIZE"] = "2" + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 3, world_size: 4) + ensure + ENV.delete("LOCAL_WORLD_SIZE") + end + + assert_equal 1, calls.size + assert_equal 1, calls.first[:device_id] + end + + def test_uses_world_size_when_env_missing + calls = [] + with_stubbed_init_process_group(calls) do + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 1, world_size: 2) + end + + assert_equal 1, calls.size + assert_equal 1, calls.first[:device_id] + end + + 
private + + def with_stubbed_init_process_group(calls) + original = Torch::Distributed.method(:_init_process_group) + Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| + calls << {backend: backend, rank: rank, world_size: world_size, timeout_ms: timeout_ms, device_id: device_id} + :stub + end + yield + ensure + Torch::Distributed.singleton_class.define_method(:_init_process_group, original) + end +end + +class DistributedSpawnStartMethodTest < Minitest::Test + def test_spawn_worker_env_runs_block + reader, writer = IO.pipe + writer.close_on_exec = false + + pid = fork do + reader.close + ENV[Torch::Distributed::SPAWN_ENV_KEY] = "1" + ENV[Torch::Distributed::SPAWN_RANK_ENV_KEY] = "0" + ENV[Torch::Distributed::SPAWN_WORLD_SIZE_ENV_KEY] = "1" + ENV[Torch::Distributed::SPAWN_PORT_ENV_KEY] = "1234" + ENV[Torch::Distributed::SPAWN_PIPE_ENV_KEY] = writer.fileno.to_s + Torch::Distributed.fork_world(1, start_method: :spawn) { |rank, port| [rank, port] } + end + + writer.close + result = Marshal.load(reader) + reader.close + + _pid, status = Process.wait2(pid) + assert status.success? + assert_equal [0, 1234], result + end +end + +class DistributedBackendTest < Minitest::Test + BACKEND = nil + def setup super skip "Distributed backend not available" unless Torch::Distributed.available? + skip "No backend configured for test" unless backend + skip_unless_backend_available! end - def test_all_reduce - results = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) - Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + def backend + self.class::BACKEND + end + + def tensor_options + {} + end + + def skip_unless_backend_available! + skip "#{backend} backend not available" unless backend_available? + end + + def backend_available? 
+ port = Torch::Distributed.free_port + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false) + Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1) + true + rescue StandardError => e + return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i + raise + ensure + Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? + end + + def nccl_device_id(rank) + rank + end - tensor = Torch.tensor([rank + 1.0]) + def fork_with_backend(world_size: 2, start_method: :fork) + original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] + original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] + ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn + ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn + Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + device_id = backend == "nccl" ? nccl_device_id(rank) : nil + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size, device_id: device_id) + begin + yield(rank) + ensure + Torch::Distributed.destroy_process_group + end + end + ensure + ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = original_filter + ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = original_script + end + + def test_all_reduce + results = fork_with_backend do |rank| + tensor = Torch.tensor([rank + 1.0], **tensor_options) Torch::Distributed.all_reduce(tensor) - Torch::Distributed.destroy_process_group tensor.to_a end @@ -22,15 +151,11 @@ def test_all_reduce end def test_barrier - wait_times = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
- Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - + wait_times = fork_with_backend do |rank| sleep 0.3 if rank.zero? before = Process.clock_gettime(Process::CLOCK_MONOTONIC) Torch::Distributed.barrier after = Process.clock_gettime(Process::CLOCK_MONOTONIC) - Torch::Distributed.destroy_process_group after - before end @@ -39,13 +164,9 @@ def test_barrier end def test_broadcast - tensors = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) - Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - - tensor = Torch.tensor([rank + 1.0]) + tensors = fork_with_backend do |rank| + tensor = Torch.tensor([rank + 1.0], **tensor_options) Torch::Distributed.broadcast(tensor, src: 0) - Torch::Distributed.destroy_process_group tensor.to_a end @@ -53,25 +174,45 @@ def test_broadcast end def test_ddp_gradient_sync - grads = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
- Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - + grads = fork_with_backend do |rank| + device = tensor_options[:device] model = Torch::NN::Linear.new(1, 1, bias: false) + model = model.to(device) if device ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) - input = Torch.tensor([[rank + 1.0]]) + input = Torch.tensor([[rank + 1.0]], **tensor_options) output = ddp.call(input) loss = output.sum loss.backward - grad = model.parameters.first.grad.item - Torch::Distributed.destroy_process_group - grad + grad = model.parameters.first.grad + grad = grad.to("cpu") if device + grad.item end grads.each do |grad| assert_in_delta 1.5, grad, 1e-6 end end +end + +class DistributedGlooTest < DistributedBackendTest + BACKEND = "gloo" +end + +class DistributedNcclTest < DistributedBackendTest + BACKEND = "nccl" + + def setup + skip "CUDA not available for NCCL backend" unless Torch.const_defined?(:CUDA) && Torch::CUDA.available? + skip "Need at least 2 CUDA devices for NCCL tests" unless Torch::CUDA.device_count >= 2 + super + end + def tensor_options + {device: "cuda"} + end + + def fork_with_backend(world_size: 2, start_method: :spawn) + super(world_size: world_size, start_method: start_method) + end end diff --git a/test/test_helper.rb b/test/test_helper.rb index 347bbb4f..76f913cf 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,7 +1,33 @@ +spawn_worker = ENV["TORCH_DISTRIBUTED_SPAWNED"] == "1" + +# Spawned distributed workers shouldn't try to load minitest plugins from the +# parent test environment. 
+ENV["MT_NO_PLUGINS"] = "1" if spawn_worker + require "bundler/setup" Bundler.require(:default) require "minitest/autorun" +if spawn_worker + module TorchDistributedSpawnTest + module QuietSummaryReporter + def start # :nodoc: + Minitest::StatisticsReporter.instance_method(:start).bind(self).call + self.sync = io.respond_to?(:"sync=") + self.old_sync, io.sync = io.sync, true if self.sync + end + + def report # :nodoc: + super + ensure + io.sync = self.old_sync if self.sync + end + end + end + + Minitest::SummaryReporter.prepend(TorchDistributedSpawnTest::QuietSummaryReporter) +end + # support require_relative "support/net" From 0286f2a1b8bf96e90f2d93756c121c95d9e67db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 22:51:55 +0300 Subject: [PATCH 23/28] possible fix for non-cuda c10 --- ext/torch/cuda.cpp | 32 ++++++++++++++++++++++++-------- ext/torch/extconf.rb | 24 ++++++++++++++++++++---- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/ext/torch/cuda.cpp b/ext/torch/cuda.cpp index 69b2529f..5a7c52e4 100644 --- a/ext/torch/cuda.cpp +++ b/ext/torch/cuda.cpp @@ -1,20 +1,36 @@ #include +#ifdef HAVE_C10_CUDA #include +#endif #include #include "utils.h" void init_cuda(Rice::Module& m) { - Rice::define_module_under(m, "CUDA") + auto rb_mCUDA = Rice::define_module_under(m, "CUDA"); + + rb_mCUDA .define_singleton_function("available?", &torch::cuda::is_available) .define_singleton_function("device_count", &torch::cuda::device_count) .define_singleton_function("manual_seed", &torch::cuda::manual_seed) - .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all) - .define_singleton_function( - "set_device", - [](int device_id) { - c10::cuda::set_device(device_id); - return Rice::Nil; - }); + .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all); + +#ifdef HAVE_C10_CUDA + rb_mCUDA.define_singleton_function( + 
"set_device", + [](int device_id) { + c10::cuda::set_device(device_id); + return Rice::Nil; + }); +#else + rb_mCUDA.define_singleton_function( + "set_device", + [](int) { + rb_raise( + rb_eRuntimeError, + "c10 CUDA support is not available in this build; set_device cannot be used"); + return Rice::Nil; + }); +#endif } diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 2cadd1c9..d30f4982 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -55,6 +55,25 @@ $INCFLAGS += " -I#{inc}" $INCFLAGS += " -I#{inc}/torch/csrc/api/include" +CONFIG["CC"] = CONFIG["CXX"] +$CFLAGS = $CXXFLAGS + +supports_c10_cuda = with_cuda && try_compile(<<~CPP) + #include + #include + + int main() { + c10::cuda::set_device(0); + return 0; + } +CPP + +unless supports_c10_cuda + puts "c10 CUDA headers not available; features that require them will be disabled" +else + $defs << " -DHAVE_C10_CUDA" +end + $LDFLAGS += " -Wl,-rpath,#{lib}" if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib") $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib" @@ -71,9 +90,6 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end -CONFIG["CC"] = CONFIG["CXX"] -$CFLAGS = $CXXFLAGS - supports_c10d = try_link(<<~CPP, "-DUSE_C10D") #include #include @@ -98,7 +114,7 @@ } CPP -supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") +supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") #include #include From 4276d638ff8ca8b8c52310c45b3060c3c4c73ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 23:44:38 +0300 Subject: [PATCH 24/28] added missing const_cast --- ext/torch/tensor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index d5fb0bc3..353ae7cd 100644 --- 
a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -46,7 +46,7 @@ struct RubyTensorHook { // ruby_init_stack is idempotent and safe to call repeatedly; it ensures the // current native thread is known to the VM before we try to grab the GVL. volatile VALUE stack_anchor = Qnil; - ruby_init_stack(&stack_anchor); + ruby_init_stack(const_cast(&stack_anchor)); } ~RubyTensorHook() { From 51785c99bb53ef3ffcdd498aa7e54f1fca9f639c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 23:44:54 +0300 Subject: [PATCH 25/28] skipping cuda tests when c10 for nccl in not available --- test/distributed_test.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 659a4930..90c82e17 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -5,6 +5,7 @@ class DistributedInitProcessGroupTest < Minitest::Test def setup skip "Distributed backend not available" unless Torch::Distributed.available? + skip "CUDA not available for NCCL backend" unless cuda_available? end def test_defaults_nccl_device_id_from_local_rank_env @@ -55,6 +56,10 @@ def with_stubbed_init_process_group(calls) ensure Torch::Distributed.singleton_class.define_method(:_init_process_group, original) end + + def cuda_available? + Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ end end class DistributedSpawnStartMethodTest < Minitest::Test From e63f784d12d55452a54d21348a43d9a93895522b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 21 Nov 2025 17:48:07 +0300 Subject: [PATCH 26/28] GLOO support --- README.md | 38 ++++ examples/benchmark/training.rb | 213 ++++++++++++++++-- ext/torch/distributed.cpp | 1 + ext/torch/extconf.rb | 30 ++- lib/torch/device.rb | 5 +- lib/torch/distributed.rb | 5 +- .../nn/parallel/distributed_data_parallel.rb | 14 +- test/device_test.rb | 5 + test/distributed_test.rb | 52 +++-- 9 files changed, 306 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 78934193..72161779 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ Then run: bundle config build.torch-rb --with-torch-dir=/path/to/libtorch ``` +In order to build distributed features (if your LibTorch supports it) add the following to the build config string: +```sh +... --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo +``` + And add this line to your application’s Gemfile: ```ruby @@ -95,6 +100,39 @@ On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-re For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. +### Distributed benchmark + +Generate a comparison table across backends, group sizes, and batch sizes: + +```sh +bundle exec ruby examples/benchmark/training.rb --backends gloo,nccl --batch-sizes 32,64,128,256 --gpus 2 --steps 50 +``` + +Example results on dual RTX 3090s: +Processing speed: images per second. Convergence speed: average loss reduction per step and per second. 
+ +```text +Backend | Proc Group | Batch | Images/s | +--------+------------+-------+----------| +gloo | 1 | 32 | 1724.4 | +gloo | 1 | 64 | 1941.8 | +gloo | 1 | 128 | 2038.7 | +gloo | 1 | 256 | 2171.8 | +gloo | 2 | 32 | 2261.0 | +gloo | 2 | 64 | 2870.6 | +gloo | 2 | 128 | 3398.4 | +gloo | 2 | 256 | 3743.1 | +nccl | 1 | 32 | 1804.8 | +nccl | 1 | 64 | 1963.0 | +nccl | 1 | 128 | 2051.5 | +nccl | 1 | 256 | 2143.3 | +nccl | 2 | 32 | 3046.1 | +nccl | 2 | 64 | 3513.6 | +nccl | 2 | 128 | 3892.1 | +nccl | 2 | 256 | 4024.5 | +--------+------------+-------+----------| +``` + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: diff --git a/examples/benchmark/training.rb b/examples/benchmark/training.rb index 090018f6..83b85520 100644 --- a/examples/benchmark/training.rb +++ b/examples/benchmark/training.rb @@ -13,6 +13,70 @@ else Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" end +SPAWN_BACKEND_ENV = "TORCH_RB_BENCH_BACKEND".freeze +SPAWN_GROUP_ENV = "TORCH_RB_BENCH_GROUP_SIZE".freeze +SPAWN_BATCH_ENV = "TORCH_RB_BENCH_BATCH_SIZE".freeze + +def parse_list(value) + value.split(",").map(&:strip).reject(&:empty?) +end + +def backend_supported?(backend) + return true unless backend == "nccl" + + Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? +end + +def usable_cuda_device_count + return 0 unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + + Torch::CUDA.respond_to?(:device_count) ? Torch::CUDA.device_count : 0 +rescue + 0 +end + +def spawn_worker_process? 
+ ENV[Torch::Distributed::SPAWN_ENV_KEY] == "1" +end + +def apply_spawn_overrides!(options) + return unless ENV[Torch::Distributed::SPAWN_ENV_KEY] == "1" + + if ENV[SPAWN_BACKEND_ENV] + options[:backends] = [ENV[SPAWN_BACKEND_ENV]] + end + + if ENV[SPAWN_GROUP_ENV] + group_size = ENV[SPAWN_GROUP_ENV].to_i + if group_size.positive? + options[:group_sizes] = [group_size] + options[:gpus] = group_size + end + end + + if ENV[SPAWN_BATCH_ENV] + batch_size = ENV[SPAWN_BATCH_ENV].to_i + options[:batch_sizes] = [batch_size] if batch_size.positive? + end +end + +def with_spawn_env(backend:, group_size:, batch_size:) + previous = { + SPAWN_BACKEND_ENV => ENV[SPAWN_BACKEND_ENV], + SPAWN_GROUP_ENV => ENV[SPAWN_GROUP_ENV], + SPAWN_BATCH_ENV => ENV[SPAWN_BATCH_ENV] + } + + ENV[SPAWN_BACKEND_ENV] = backend + ENV[SPAWN_GROUP_ENV] = group_size.to_s + ENV[SPAWN_BATCH_ENV] = batch_size.to_s + + yield +ensure + ENV[SPAWN_BACKEND_ENV] = previous[SPAWN_BACKEND_ENV] + ENV[SPAWN_GROUP_ENV] = previous[SPAWN_GROUP_ENV] + ENV[SPAWN_BATCH_ENV] = previous[SPAWN_BATCH_ENV] +end class MnistCnn < Torch::NN::Module def initialize @@ -47,11 +111,12 @@ def forward(x) def parse_options defaults = { arch: "mnist_cnn", - batch_size: 128, + batch_sizes: [128], steps: 100, warmup: 10, - backend: DEFAULT_BACKEND, + backends: [DEFAULT_BACKEND], gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, + group_sizes: nil, data_dir: File.join(__dir__, "data"), lr: 0.01 } @@ -59,15 +124,19 @@ def parse_options OptionParser.new do |opts| opts.banner = "Usage: ruby examples/benchmark/training.rb [options]" opts.on("--arch NAME", "Architecture to benchmark (#{ARCH_CONFIGS.keys.join(', ')}, default: #{defaults[:arch]})") { |v| defaults[:arch] = v } - opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_sizes].first})") { |v| defaults[:batch_sizes] = [v] } + opts.on("--batch-sizes LIST", String, "Comma-separated batch sizes per process") { |v| defaults[:batch_sizes] = parse_list(v).map(&:to_i) } opts.on("--steps N", Integer, "Number of timed training steps (default: #{defaults[:steps]})") { |v| defaults[:steps] = v } opts.on("--warmup N", Integer, "Number of warmup steps not included in timing (default: #{defaults[:warmup]})") { |v| defaults[:warmup] = v } - opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backends].first})") { |v| defaults[:backends] = [v] } + opts.on("--backends LIST", String, "Comma-separated list of backends to benchmark (gloo,nccl)") { |v| defaults[:backends] = parse_list(v) } opts.on("--gpus N", Integer, "Number of GPUs/processes to use (1 for non-distributed)") { |v| defaults[:gpus] = v } + opts.on("--group-sizes LIST", String, "Process group sizes to benchmark (default: 1..gpus)") { |v| defaults[:group_sizes] = parse_list(v).map(&:to_i) } opts.on("--data-dir PATH", String, "Directory for cached datasets (default: #{defaults[:data_dir]})") { |v| defaults[:data_dir] = v } opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } end.parse!(ARGV) + 
defaults[:group_sizes] ||= (1..defaults[:gpus]).to_a defaults end @@ -110,15 +179,16 @@ def benchmark_worker(rank, world_size, port, options) raise ArgumentError, "Unsupported architecture #{arch.inspect}" unless config distributed = world_size > 1 + accelerator = Torch::Accelerator.current_accelerator + selected_backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND if distributed store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) - accelerator = Torch::Accelerator.current_accelerator - backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND - Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + Torch::Distributed.init_process_group(selected_backend, store: store, rank: rank, world_size: world_size) end - device = if Torch::CUDA.available? && options[:gpus] > 0 - Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + cuda_devices = usable_cuda_device_count + device = if cuda_devices.positive? && options[:gpus] > 0 + Torch.device("cuda:#{rank % cuda_devices}") else Torch.device("cpu") end @@ -139,7 +209,9 @@ def benchmark_worker(rank, world_size, port, options) warmup_steps = options[:warmup] timed_steps = options[:steps] total_steps = warmup_steps + timed_steps + losses = [] + # Warm up the model (including one full timed-length pass) to avoid init overhead in measurements. step_idx = 0 loader.each do |data, target| data = data.to(device) @@ -169,6 +241,14 @@ def benchmark_worker(rank, world_size, port, options) loss.backward optimizer.step + loss_value = loss.item + if distributed + loss_tensor = Torch.tensor([loss_value], device: device) + Torch::Distributed.all_reduce(loss_tensor) + loss_value = loss_tensor.item / world_size.to_f + end + losses << loss_value if !distributed || rank.zero? 
+ step_idx += 1 break if step_idx >= timed_steps end @@ -178,30 +258,115 @@ def benchmark_worker(rank, world_size, port, options) elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start timed = step_idx - if rank.zero? - images = timed * options[:batch_size] * world_size - puts "Architecture: #{arch}" - puts "Dataset: #{config[:dataset]}" - puts "GPUs: #{world_size}" - puts "Batch size per process: #{options[:batch_size]}" - puts "Timed steps: #{timed}" - puts "Total images: #{images}" - puts format("Elapsed: %.3fs | Throughput: %.1f images/s", elapsed, images / elapsed) + images = timed * options[:batch_size] * world_size + throughput = elapsed.positive? ? images.to_f / elapsed : 0.0 + initial_loss = losses.first || 0.0 + final_loss = losses.last || initial_loss + loss_delta = initial_loss - final_loss + loss_delta_per_step = timed.zero? ? 0.0 : loss_delta / timed + loss_delta_per_sec = elapsed.zero? ? 0.0 : loss_delta / elapsed + + result = if !distributed || rank.zero? + { + backend: selected_backend, + world_size: world_size, + batch_size: options[:batch_size], + arch: arch, + dataset: config[:dataset], + elapsed: elapsed, + timed_steps: timed, + images: images, + throughput: throughput, + initial_loss: initial_loss, + final_loss: final_loss, + loss_delta: loss_delta, + loss_delta_per_step: loss_delta_per_step, + loss_delta_per_sec: loss_delta_per_sec + } end Torch::Distributed.destroy_process_group if distributed + result +end + +def run_benchmark_case(world_size, options) + if world_size > 1 + outputs = Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| + benchmark_worker(rank, world_size, port, options) + end + outputs.compact.first + else + benchmark_worker(0, 1, Torch::Distributed.free_port, options) + end +end + +def print_summary_table(results) + puts "\nBenchmark comparison (processing vs convergence)" + puts "Processing speed: images per second. 
Convergence speed: average loss reduction per step and per second.\n" + + headers = ["Backend", "Proc Group", "Batch", "Images/s", "Loss delta/step", "Loss delta/s", "Final loss"] + formatters = [ + ->(r) { r[:backend] }, + ->(r) { r[:world_size] }, + ->(r) { r[:batch_size] }, + ->(r) { format("%.1f", r[:throughput]) }, + ->(r) { format("%.4f", r[:loss_delta_per_step]) }, + ->(r) { format("%.4f", r[:loss_delta_per_sec]) }, + ->(r) { format("%.4f", r[:final_loss]) } + ] + + widths = headers.each_with_index.map do |header, idx| + [header.length, results.map { |r| formatters[idx].call(r).to_s.length }.max].compact.max + end + + header_line = headers.each_with_index.map { |h, idx| h.ljust(widths[idx]) }.join(" | ") + divider = widths.map { |w| "-" * w }.join("-+-") + puts header_line + puts divider + + results.sort_by { |r| [r[:backend], r[:world_size], r[:batch_size]] }.each do |result| + row = formatters.each_with_index.map { |formatter, idx| formatter.call(result).to_s.ljust(widths[idx]) } + puts row.join(" | ") + end end options = parse_options -world_size = options[:gpus] -raise "Number of GPUs requested must be >= 1" if world_size < 1 +apply_spawn_overrides!(options) +max_world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if max_world_size < 1 Torch.manual_seed(1) -if world_size > 1 +group_sizes = options[:group_sizes].map { |v| [v, max_world_size].min }.select { |v| v >= 1 }.uniq.sort +batch_sizes = options[:batch_sizes].map { |v| [v, 1].max }.uniq +backends = options[:backends].map(&:downcase).uniq + +if group_sizes.any? { |size| size > 1 } raise "torch.distributed is not available" unless Torch::Distributed.available? - Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| - benchmark_worker(rank, world_size, port, options) +end + +results = [] + +backends.each do |backend| + unless backend_supported?(backend) + warn "Skipping backend=#{backend} because required accelerator support is unavailable." 
+ next end + + group_sizes.each do |world_size| + batch_sizes.each do |batch_size| + run_options = options.merge(batch_size: batch_size, backend: backend, gpus: world_size) + puts "Running backend=#{backend}, group_size=#{world_size}, batch_size=#{batch_size}..." unless spawn_worker_process? + with_spawn_env(backend: backend, group_size: world_size, batch_size: batch_size) do + results << run_benchmark_case(world_size, run_options) + end + end + end +end + +results.compact! + +if results.empty? + puts "No benchmark results to report." else - benchmark_worker(0, 1, Torch::Distributed.free_port, options) + print_summary_table(results) end diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp index b3a22bc3..a3d50680 100644 --- a/ext/torch/distributed.cpp +++ b/ext/torch/distributed.cpp @@ -198,6 +198,7 @@ void init_distributed(Rice::Module& m) { } else if (backend_lower == "nccl") { #if defined(USE_C10D_NCCL) auto options = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + options->timeout = std::chrono::milliseconds(timeout_millis); pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(store, rank, world_size, options); #else rb_raise(rb_eRuntimeError, "NCCL backend is not available in this build"); diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index d30f4982..916588d9 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -38,6 +38,9 @@ cudnn_inc, cudnn_lib = dir_config("cudnn") cudnn_lib ||= "/usr/local/cuda/lib" +gloo_inc, _ = dir_config("gloo") +gloo_inc ||= "./vendor/gloo" + $LDFLAGS += " -L#{lib}" if Dir.exist?(lib) abort "LibTorch not found" unless have_library("torch") @@ -68,9 +71,7 @@ } CPP -unless supports_c10_cuda - puts "c10 CUDA headers not available; features that require them will be disabled" -else +if supports_c10_cuda $defs << " -DHAVE_C10_CUDA" end @@ -100,6 +101,19 @@ } CPP +if supports_c10d + $defs << " -DUSE_C10D" + puts "Building with distributed support" + + if find_header("gloo/algorithm.h", gloo_inc) 
+ $INCFLAGS += " -I#{gloo_inc}" + else + puts "GLOO headers not found. Consider setting --with-gloo-include param" + end +else + puts "Building without distributed support" +end + supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") #include #include @@ -125,17 +139,13 @@ } CPP -if supports_c10d - $defs << " -DUSE_C10D" - puts "Building with distributed support" -else - puts "Building without distributed support" -end - if supports_c10d_gloo $defs << "-DUSE_C10D_GLOO" puts "GLOO support detected" end +unless supports_c10_cuda + puts "No c10 CUDA headers found. NCCL is unavailable" +end if supports_c10d_nccl $defs << "-DUSE_C10D_NCCL" puts "NCCL support detected" diff --git a/lib/torch/device.rb b/lib/torch/device.rb index f80868ff..0621e463 100644 --- a/lib/torch/device.rb +++ b/lib/torch/device.rb @@ -8,7 +8,10 @@ def inspect extra = ", index: #{index.inspect}" if index? "device(type: #{type.inspect}#{extra})" end - alias_method :to_s, :inspect + + def to_s + _str + end def ==(other) eql?(other) diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb index 7e1e739d..d69e0396 100644 --- a/lib/torch/distributed.rb +++ b/lib/torch/distributed.rb @@ -53,7 +53,10 @@ def init_process_group(backend = nil, init_method: "env://", store: nil, rank: n timeout_ms = (timeout * 1000).to_i bound_device_id = device_id.nil? ? -1 : Integer(device_id) if backend == "nccl" && bound_device_id >= 0 && Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device) - Torch::CUDA.set_device(bound_device_id) + device_count = Torch::CUDA.device_count if Torch::CUDA.respond_to?(:device_count) + # Only attempt to switch devices when the requested id exists to avoid + # raising on hosts with fewer GPUs than the provided local rank. + Torch::CUDA.set_device(bound_device_id) if device_count.nil? 
|| bound_device_id < device_count end pg = _init_process_group(backend, store, rank, world_size, timeout_ms, bound_device_id) warmup_process_group(pg, backend) diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb index 2a1782c8..dd5e0245 100644 --- a/lib/torch/nn/parallel/distributed_data_parallel.rb +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -84,17 +84,21 @@ def move_value(value, device) def synchronize_parameters Torch::Distributed.barrier(group: @process_group) - @module.parameters.each do |param| - Torch::Distributed.broadcast(param, src: 0, group: @process_group) + Torch.no_grad do + @module.parameters.each do |param| + Torch::Distributed.broadcast(param, src: 0, group: @process_group) + end + broadcast_buffers_if_needed end - broadcast_buffers_if_needed end def broadcast_buffers_if_needed return unless @broadcast_buffers - @module.buffers.each do |buffer| - Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + Torch.no_grad do + @module.buffers.each do |buffer| + Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + end end end diff --git a/test/device_test.rb b/test/device_test.rb index 69f778f6..b31b3348 100644 --- a/test/device_test.rb +++ b/test/device_test.rb @@ -22,4 +22,9 @@ def test_inspect assert_equal %!device(type: "cpu")!, Torch.device("cpu").inspect assert_equal %!device(type: "cpu", index: 0)!, Torch.device("cpu:0").inspect end + + def test_to_s + assert_equal "cpu", Torch.device("cpu").to_s + assert_equal "cpu:0", Torch.device("cpu:0").to_s + end end diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 90c82e17..7491c9a7 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -1,6 +1,7 @@ require_relative "test_helper" require "torch/distributed" require "socket" +require "timeout" class DistributedInitProcessGroupTest < Minitest::Test def setup @@ -46,6 +47,8 @@ def 
test_uses_world_size_when_env_missing private + # Stub out low-level init to capture arguments without starting a real process group + # Used for upper-level tests that don't require actial process group spawning def with_stubbed_init_process_group(calls) original = Torch::Distributed.method(:_init_process_group) Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| @@ -110,9 +113,10 @@ def skip_unless_backend_available! end def backend_available? + timeout = distributed_timeout port = Torch::Distributed.free_port - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false) - Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1) + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false, timeout: timeout) + Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1, timeout: timeout) true rescue StandardError => e return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i @@ -121,23 +125,30 @@ def backend_available? Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? end - def nccl_device_id(rank) - rank - end - - def fork_with_backend(world_size: 2, start_method: :fork) + def fork_with_backend(world_size: 2, start_method: :spawn) + timeout = distributed_timeout original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn - Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) - device_id = backend == "nccl" ? 
nccl_device_id(rank) : nil - Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size, device_id: device_id) - begin - yield(rank) - ensure - Torch::Distributed.destroy_process_group + Timeout.timeout(timeout, Timeout::Error, "distributed test exceeded #{timeout}s") do + Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| + Timeout.timeout(timeout, Timeout::Error, "distributed worker #{rank} exceeded #{timeout}s") do + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?, timeout: timeout) + Torch::Distributed.init_process_group( + backend, + store: store, + rank: rank, + world_size: world_size, + device_id: rank, + timeout: timeout + ) + begin + yield(rank) + ensure + Torch::Distributed.destroy_process_group + end + end end end ensure @@ -179,7 +190,8 @@ def test_broadcast end def test_ddp_gradient_sync - grads = fork_with_backend do |rank| + # autograd cannot run safely with fork-based multiprocessing; always use spawn here + grads = fork_with_backend(start_method: :spawn) do |rank| device = tensor_options[:device] model = Torch::NN::Linear.new(1, 1, bias: false) model = model.to(device) if device @@ -198,10 +210,18 @@ def test_ddp_gradient_sync assert_in_delta 1.5, grad, 1e-6 end end + + def distributed_timeout + Integer(ENV.fetch("TORCH_DISTRIBUTED_TEST_TIMEOUT", "30")) + end end class DistributedGlooTest < DistributedBackendTest BACKEND = "gloo" + + def fork_with_backend(world_size: 2, start_method: :fork) + super(world_size: world_size, start_method: start_method) + end end class DistributedNcclTest < DistributedBackendTest From 3ab737c4edf75feee07ecbdbc3a88a2677a1f53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 5 Dec 2025 11:34:46 +0300 Subject: [PATCH 27/28] only multi-device core functionality remains --- README.md | 79 ++----------------------------------- 
ext/torch/ext.cpp | 2 - ext/torch/extconf.rb | 66 +------------------------------ lib/torch.rb | 2 - lib/torch/nn/module.rb | 53 ------------------------- lib/torch/nn/module_list.rb | 6 ++- test/nn/module_test.rb | 13 ------ test/test_helper.rb | 26 ------------ torch-rb.gemspec | 4 +- 9 files changed, 11 insertions(+), 240 deletions(-) diff --git a/README.md b/README.md index 72161779..320e6d3a 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ As well as: First, [download LibTorch](https://pytorch.org/get-started/locally/). For Mac arm64, use: ```sh -curl -L https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.9.0.zip > libtorch.zip +curl -L https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.9.1.zip > libtorch.zip unzip -q libtorch.zip ``` @@ -34,11 +34,6 @@ Then run: bundle config build.torch-rb --with-torch-dir=/path/to/libtorch ``` -In order to build distributed features (if your LibTorch supports it) add the following to the build config string: -```sh -... --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo -``` - And add this line to your application’s Gemfile: ```ruby @@ -47,6 +42,8 @@ gem "torch-rb" It can take 5-10 minutes to compile the extension. Windows is not currently supported. +For distributed data parallel helpers, add the optional `torch-ddp` gem alongside this one. + ## Getting Started A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutorials/blitz/README.md). 
@@ -60,79 +57,9 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori ## Examples - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) -- [Distributed MNIST training](examples/mnist/distributed.rb) -- [Training benchmarks (variable batch size / GPU count)](examples/benchmark/training.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) -Run the benchmark with: - -```sh -bundle exec ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 256 --gpus 1 --steps 50 -``` - -Set `--gpus` to 2+ to enable distributed training; `--steps` measures only timed steps and `--warmup` sets warmup iterations. - -## Distributed Training - -Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. - -Start a single-node job with a process per GPU (or CPU) with: - -```sh -bundle exec torchrun --standalone --nproc-per-node=gpu path/to/training_script.rb --script-arg value -``` - -For multi-node runs, launch the same command on every node with matching rendezvous settings: - -```sh -bundle exec torchrun \ - --nnodes=2 \ - --node-rank=0 \ - --rdzv-backend=c10d \ - --rdzv-endpoint=host0.example.com:29503 \ - --rdzv-id=my-job \ - --nproc-per-node=4 \ - path/to/training_script.rb -``` - -On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. - -For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. 
- -### Distributed benchmark - -Generate a comparison table across backends, group sizes, and batch sizes: - -```sh -bundle exec ruby examples/benchmark/training.rb --backends gloo,nccl --batch-sizes 32,64,128,256 --gpus 2 --steps 50 -``` - -Example results on dual RTX 3090s: -Processing speed: images per second. Convergence speed: average loss reduction per step and per second. - -```text -Backend | Proc Group | Batch | Images/s | ---------+------------+-------+----------| -gloo | 1 | 32 | 1724.4 | -gloo | 1 | 64 | 1941.8 | -gloo | 1 | 128 | 2038.7 | -gloo | 1 | 256 | 2171.8 | -gloo | 2 | 32 | 2261.0 | -gloo | 2 | 64 | 2870.6 | -gloo | 2 | 128 | 3398.4 | -gloo | 2 | 256 | 3743.1 | -nccl | 1 | 32 | 1804.8 | -nccl | 1 | 64 | 1963.0 | -nccl | 1 | 128 | 2051.5 | -nccl | 1 | 256 | 2143.3 | -nccl | 2 | 32 | 3046.1 | -nccl | 2 | 64 | 3513.6 | -nccl | 2 | 128 | 3892.1 | -nccl | 2 | 256 | 4024.5 | ---------+------------+-------+----------| -``` - ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). 
There are a few changes to make it more Ruby-like: diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index dc9cef20..c07528b8 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -7,7 +7,6 @@ void init_linalg(Rice::Module& m); void init_nn(Rice::Module& m); void init_special(Rice::Module& m); void init_accelerator(Rice::Module& m); -void init_distributed(Rice::Module& m); void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions); void init_torch(Rice::Module& m); @@ -49,5 +48,4 @@ void Init_ext() { init_generator(m, rb_cGenerator); init_ivalue(m, rb_cIValue); init_random(m); - init_distributed(m); } diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 916588d9..0032088d 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -38,9 +38,6 @@ cudnn_inc, cudnn_lib = dir_config("cudnn") cudnn_lib ||= "/usr/local/cuda/lib" -gloo_inc, _ = dir_config("gloo") -gloo_inc ||= "./vendor/gloo" - $LDFLAGS += " -L#{lib}" if Dir.exist?(lib) abort "LibTorch not found" unless have_library("torch") @@ -50,7 +47,7 @@ with_cuda = false if Dir["#{lib}/*torch_cuda*"].any? 
$LDFLAGS += " -L#{cuda_lib}" if Dir.exist?(cuda_lib) - $INCFLAGS += " -I#{cuda_inc}" if Dir.exist?(cuda_inc) + $INCFLAGS += " -I#{cuda_inc}" if cuda_inc && Dir.exist?(cuda_inc) $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib with_cuda = have_library("cuda") && have_library("cudnn") end @@ -61,6 +58,7 @@ CONFIG["CC"] = CONFIG["CXX"] $CFLAGS = $CXXFLAGS +abort "cuda.h not found" if with_cuda && !find_header("cuda.h") supports_c10_cuda = with_cuda && try_compile(<<~CPP) #include #include @@ -91,66 +89,6 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end -supports_c10d = try_link(<<~CPP, "-DUSE_C10D") - #include - #include - - int main() { - ::c10d::FileStore store("unused", 1); - return 0; - } -CPP - -if supports_c10d - $defs << " -DUSE_C10D" - puts "Building with distributed support" - - if find_header("gloo/algorithm.h", gloo_inc) - $INCFLAGS += " -I#{gloo_inc}" - else - puts "GLOO headers not found. Consider setting --with-gloo-include param" - end -else - puts "Building without distributed support" -end - -supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") - #include - #include - #include - - int main() { - auto store = c10::make_intrusive<::c10d::FileStore>("unused", 1); - auto opts = ::c10d::ProcessGroupGloo::Options::create(); - opts->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); - ::c10d::ProcessGroupGloo pg(store, 0, 1, opts); - return static_cast(pg.getRank()); - } -CPP - -supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") - #include - #include - - int main() { - auto opts = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); - opts->is_high_priority_stream = false; - return 0; - } -CPP - -if supports_c10d_gloo - $defs << "-DUSE_C10D_GLOO" - puts "GLOO support detected" -end -unless supports_c10_cuda - puts "No c10 CUDA headers found. 
NCCL is unavailable" -end -if supports_c10d_nccl - $defs << "-DUSE_C10D_NCCL" - puts "NCCL support detected" -end - # generate C++ functions puts "Generating C++ functions..." require_relative "../../codegen/generate_functions" diff --git a/lib/torch.rb b/lib/torch.rb index dd652872..667315a5 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -10,7 +10,6 @@ # modules require_relative "torch/device" require_relative "torch/accelerator" -require_relative "torch/distributed" require_relative "torch/inspector" require_relative "torch/tensor" require_relative "torch/version" @@ -193,7 +192,6 @@ require_relative "torch/nn/functional" require_relative "torch/nn/functional_attention" require_relative "torch/nn/init" -require_relative "torch/nn/parallel/distributed_data_parallel" # utils require_relative "torch/utils/data" diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index df0500b5..f9a76f98 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -437,58 +437,5 @@ def dup_value(v, memo) end end end - - class ModuleList < Module - def initialize(mods = nil) - super() - - return unless mods - self.extend(mods) - end - - def length - @modules.length - end - - alias :count :length - - def extend(mods) - raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each) - - mods.each { |m| append m } - - self - end - - def each(&block) - @modules.values.each &block - end - - def map(&block) - @modules.values.map &block - end - - def inject(inj, &block) - @modules.values.inject(inj, &block) - end - - def append(mod) - raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) - add_module(length.to_s, mod) - self - end - - def [](*idx) - idx.map do |id| - if id.is_a?(Integer) - @modules[id.to_s] - elsif id.is_a?(Range) - id.each do |i| - @modules[i.to_s] - end - end - end.flatten - end - end end end diff --git a/lib/torch/nn/module_list.rb b/lib/torch/nn/module_list.rb index 925bab5b..02c17575 100644 --- 
a/lib/torch/nn/module_list.rb +++ b/lib/torch/nn/module_list.rb @@ -6,7 +6,7 @@ class ModuleList < Module def initialize(mods = nil) super() - self.concat(mods) if mods + concat(mods) if mods end def length @@ -31,6 +31,10 @@ def each(&block) end end + def map(&block) + @modules.values.map(&block) + end + def append(mod) raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) add_module(length.to_s, mod) diff --git a/test/nn/module_test.rb b/test/nn/module_test.rb index 33790da0..d52429ea 100644 --- a/test/nn/module_test.rb +++ b/test/nn/module_test.rb @@ -71,19 +71,6 @@ def test_state_dict_buffers net.eval end - def test_state_dict_with_buffers - net = SimpleResidualBlock.new - expected_keys = %w[seq.0.weight seq.1.weight seq.1.bias seq.1.running_mean seq.1.running_var seq.1.num_batches_tracked seq.3.weight seq.4.weight seq.4.bias seq.4.running_mean seq.4.running_var seq.4.num_batches_tracked seq.6.weight seq.7.weight seq.7.bias seq.7.running_mean seq.7.running_var seq.7.num_batches_tracked] - assert_equal expected_keys, net.state_dict.keys - - tmpfile = Tempfile.new - Torch.save net.state_dict, tmpfile.path - - net = SimpleResidualBlock.new - net.load_state_dict Torch.load tmpfile.path - net.eval - end - def test_inspect assert_match "(conv1): Conv2d(1, 6, kernel_size: [3, 3], stride: [1, 1])", net.inspect end diff --git a/test/test_helper.rb b/test/test_helper.rb index 76f913cf..347bbb4f 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,33 +1,7 @@ -spawn_worker = ENV["TORCH_DISTRIBUTED_SPAWNED"] == "1" - -# Spawned distributed workers shouldn't try to load minitest plugins from the -# parent test environment. 
-ENV["MT_NO_PLUGINS"] = "1" if spawn_worker - require "bundler/setup" Bundler.require(:default) require "minitest/autorun" -if spawn_worker - module TorchDistributedSpawnTest - module QuietSummaryReporter - def start # :nodoc: - Minitest::StatisticsReporter.instance_method(:start).bind(self).call - self.sync = io.respond_to?(:"sync=") - self.old_sync, io.sync = io.sync, true if self.sync - end - - def report # :nodoc: - super - ensure - io.sync = self.old_sync if self.sync - end - end - end - - Minitest::SummaryReporter.prepend(TorchDistributedSpawnTest::QuietSummaryReporter) -end - # support require_relative "support/net" diff --git a/torch-rb.gemspec b/torch-rb.gemspec index 40c89325..0adcc03b 100644 --- a/torch-rb.gemspec +++ b/torch-rb.gemspec @@ -10,9 +10,7 @@ Gem::Specification.new do |spec| spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" - spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*", "bin/*"] - spec.executables = Dir["bin/*"].map { |file| File.basename(file) } - spec.bindir = "bin" + spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*"] spec.require_path = "lib" spec.extensions = ["ext/torch/extconf.rb"] From 03f468264bc112cf1b0bc1f887905f3841aa10a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 5 Dec 2025 11:40:18 +0300 Subject: [PATCH 28/28] distributed tests removed --- test/distributed_test.rb | 243 --------------------------------------- 1 file changed, 243 deletions(-) delete mode 100644 test/distributed_test.rb diff --git a/test/distributed_test.rb b/test/distributed_test.rb deleted file mode 100644 index 7491c9a7..00000000 --- a/test/distributed_test.rb +++ /dev/null @@ -1,243 +0,0 @@ -require_relative "test_helper" -require "torch/distributed" -require "socket" -require "timeout" - -class DistributedInitProcessGroupTest < Minitest::Test - def setup - skip "Distributed backend not available" unless Torch::Distributed.available? 
- skip "CUDA not available for NCCL backend" unless cuda_available? - end - - def test_defaults_nccl_device_id_from_local_rank_env - calls = [] - with_stubbed_init_process_group(calls) do - ENV["LOCAL_RANK"] = "2" - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 5, world_size: 8) - ensure - ENV.delete("LOCAL_RANK") - end - - assert_equal 1, calls.size - assert_equal 2, calls.first[:device_id] - end - - def test_falls_back_to_local_world_size_modulo - calls = [] - with_stubbed_init_process_group(calls) do - ENV["LOCAL_WORLD_SIZE"] = "2" - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 3, world_size: 4) - ensure - ENV.delete("LOCAL_WORLD_SIZE") - end - - assert_equal 1, calls.size - assert_equal 1, calls.first[:device_id] - end - - def test_uses_world_size_when_env_missing - calls = [] - with_stubbed_init_process_group(calls) do - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 1, world_size: 2) - end - - assert_equal 1, calls.size - assert_equal 1, calls.first[:device_id] - end - - private - - # Stub out low-level init to capture arguments without starting a real process group - # Used for upper-level tests that don't require actial process group spawning - def with_stubbed_init_process_group(calls) - original = Torch::Distributed.method(:_init_process_group) - Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| - calls << {backend: backend, rank: rank, world_size: world_size, timeout_ms: timeout_ms, device_id: device_id} - :stub - end - yield - ensure - Torch::Distributed.singleton_class.define_method(:_init_process_group, original) - end - - def cuda_available? - Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
- end -end - -class DistributedSpawnStartMethodTest < Minitest::Test - def test_spawn_worker_env_runs_block - reader, writer = IO.pipe - writer.close_on_exec = false - - pid = fork do - reader.close - ENV[Torch::Distributed::SPAWN_ENV_KEY] = "1" - ENV[Torch::Distributed::SPAWN_RANK_ENV_KEY] = "0" - ENV[Torch::Distributed::SPAWN_WORLD_SIZE_ENV_KEY] = "1" - ENV[Torch::Distributed::SPAWN_PORT_ENV_KEY] = "1234" - ENV[Torch::Distributed::SPAWN_PIPE_ENV_KEY] = writer.fileno.to_s - Torch::Distributed.fork_world(1, start_method: :spawn) { |rank, port| [rank, port] } - end - - writer.close - result = Marshal.load(reader) - reader.close - - _pid, status = Process.wait2(pid) - assert status.success? - assert_equal [0, 1234], result - end -end - -class DistributedBackendTest < Minitest::Test - BACKEND = nil - - def setup - super - skip "Distributed backend not available" unless Torch::Distributed.available? - skip "No backend configured for test" unless backend - skip_unless_backend_available! - end - - def backend - self.class::BACKEND - end - - def tensor_options - {} - end - - def skip_unless_backend_available! - skip "#{backend} backend not available" unless backend_available? - end - - def backend_available? - timeout = distributed_timeout - port = Torch::Distributed.free_port - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false, timeout: timeout) - Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1, timeout: timeout) - true - rescue StandardError => e - return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i - raise - ensure - Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? 
- end - - def fork_with_backend(world_size: 2, start_method: :spawn) - timeout = distributed_timeout - original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] - original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] - ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn - ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn - Timeout.timeout(timeout, Timeout::Error, "distributed test exceeded #{timeout}s") do - Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| - Timeout.timeout(timeout, Timeout::Error, "distributed worker #{rank} exceeded #{timeout}s") do - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?, timeout: timeout) - Torch::Distributed.init_process_group( - backend, - store: store, - rank: rank, - world_size: world_size, - device_id: rank, - timeout: timeout - ) - begin - yield(rank) - ensure - Torch::Distributed.destroy_process_group - end - end - end - end - ensure - ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = original_filter - ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = original_script - end - - def test_all_reduce - results = fork_with_backend do |rank| - tensor = Torch.tensor([rank + 1.0], **tensor_options) - Torch::Distributed.all_reduce(tensor) - tensor.to_a - end - - assert_equal [[3.0], [3.0]], results - end - - def test_barrier - wait_times = fork_with_backend do |rank| - sleep 0.3 if rank.zero? 
- before = Process.clock_gettime(Process::CLOCK_MONOTONIC) - Torch::Distributed.barrier - after = Process.clock_gettime(Process::CLOCK_MONOTONIC) - after - before - end - - assert_operator wait_times.first, :<, 0.1 - assert_operator wait_times.last, :>=, 0.25 - end - - def test_broadcast - tensors = fork_with_backend do |rank| - tensor = Torch.tensor([rank + 1.0], **tensor_options) - Torch::Distributed.broadcast(tensor, src: 0) - tensor.to_a - end - - assert_equal [[1.0], [1.0]], tensors - end - - def test_ddp_gradient_sync - # autograd cannot run safely with fork-based multiprocessing; always use spawn here - grads = fork_with_backend(start_method: :spawn) do |rank| - device = tensor_options[:device] - model = Torch::NN::Linear.new(1, 1, bias: false) - model = model.to(device) if device - ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) - input = Torch.tensor([[rank + 1.0]], **tensor_options) - output = ddp.call(input) - loss = output.sum - loss.backward - - grad = model.parameters.first.grad - grad = grad.to("cpu") if device - grad.item - end - - grads.each do |grad| - assert_in_delta 1.5, grad, 1e-6 - end - end - - def distributed_timeout - Integer(ENV.fetch("TORCH_DISTRIBUTED_TEST_TIMEOUT", "30")) - end -end - -class DistributedGlooTest < DistributedBackendTest - BACKEND = "gloo" - - def fork_with_backend(world_size: 2, start_method: :fork) - super(world_size: world_size, start_method: start_method) - end -end - -class DistributedNcclTest < DistributedBackendTest - BACKEND = "nccl" - - def setup - skip "CUDA not available for NCCL backend" unless Torch.const_defined?(:CUDA) && Torch::CUDA.available? - skip "Need at least 2 CUDA devices for NCCL tests" unless Torch::CUDA.device_count >= 2 - super - end - - def tensor_options - {device: "cuda"} - end - - def fork_with_backend(world_size: 2, start_method: :spawn) - super(world_size: world_size, start_method: start_method) - end -end