From 171f8ec7c07716bbe09a448bd2c0d8cb4816fc3d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 15:09:25 -0700 Subject: [PATCH 01/28] Updated LibTorch to 1.7.0 --- .travis.yml | 2 +- codegen/generate_functions.rb | 10 +- codegen/native_functions.yaml | 3077 +++++++++++++++++++++++++-------- ext/torch/ext.cpp | 4 +- ext/torch/ruby_arg_parser.h | 28 +- ext/torch/templates.h | 1 + ext/torch/wrap_outputs.h | 7 + 7 files changed, 2409 insertions(+), 720 deletions(-) diff --git a/.travis.yml b/.travis.yml index a2c40c9f..160cb095 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ jobs: - rvm: 2.5 dist: xenial before_install: - - export LIBTORCH_VERSION=1.6.0 + - export LIBTORCH_VERSION=1.7.0 - ./test/ci/install_libtorch.sh cache: bundler: true diff --git a/codegen/generate_functions.rb b/codegen/generate_functions.rb index 16e7c075..8d18df89 100644 --- a/codegen/generate_functions.rb +++ b/codegen/generate_functions.rb @@ -328,6 +328,8 @@ def generate_function_params(function, params, remove_self) "tensorlist" when /\Aint\[/ "intlist" + when "float[]" + "doublelist" when "Scalar" "scalar" when "bool" @@ -419,6 +421,8 @@ def generate_dispatch_params(function, params) "double" when /\Aint\[/ "IntArrayRef" + when "float[]" + "ArrayRef" when "str" "std::string" when "Scalar", "bool", "ScalarType", "Layout", "Device", "Storage", "Generator", "MemoryFormat", "Storage" @@ -466,7 +470,9 @@ def generate_dispatch_retval(function) when ["Tensor", "Tensor", "Tensor", "Tensor", "Tensor"] "std::tuple" when ["Tensor", "Tensor", "float", "int"] - "std::tuple" + "std::tuple" + when ["float", "float"] + "std::tuple" else raise "Unknown retvals: #{types}" end @@ -539,6 +545,8 @@ def signature_type(param) "std::string" when "Scalar", "Dimname", "bool", "ScalarType", "Layout", "Device", "Generator", "MemoryFormat", "Storage" param[:type] + when "float[]" + "ArrayRef" else raise "Unknown type: #{param[:type]}" end diff --git a/codegen/native_functions.yaml 
b/codegen/native_functions.yaml index 859c8773..4d748250 100644 --- a/codegen/native_functions.yaml +++ b/codegen/native_functions.yaml @@ -47,6 +47,7 @@ # Computes the gradient of current tensor w.r.t. graph leaves. - func: backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> () + use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -94,6 +95,7 @@ variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) + use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -125,12 +127,6 @@ - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) variants: method -- func: unflatten.Dimname(Tensor self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor - variants: method - -- func: unflatten.int(Tensor self, int dim, int[] sizes, Dimname[] names) -> Tensor - variants: method - - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool use_c10_dispatcher: full dispatch: @@ -150,14 +146,17 @@ CUDA: _cudnn_rnn_flatten_weight - func: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn - func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? 
dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_backward -- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: _cudnn_init_dropout_state @@ -168,21 +167,25 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) + use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) + use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) + use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor use_c10_dispatcher: full @@ -194,54 +197,81 @@ use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) 
+ use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) + use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: abs_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out + +# Note [Adding an alias] +# To add an alias do the following: +# +# 1) Copy the original functions native_functions.yaml entry, but replace the +# original function's name with their own and delete any dispatch +# keys for the aliases. Specifying a dispatch key will prevent +# autograd from recording the operations the alias performs, which +# will stop it from "inheriting" the original operation's autograd behavior. +# 2) Implement the corresponding functions and have them redispatch to the +# original function. +# 3) Add entries for the alias (and original function, if needed) to +# aten/src/ATen/core/interned_strings.h +# (This may require removing an entry from ATen/core/aten_interned_strings.h.) +# 4) Add docstrings to the new function that reference the original function, +# and document the method as usual (if it exists.) +# (See torch/_torch_docs.py and docs/source/torch.rst if adding a function, +# torch/_tensor_docs.py and docs/source/tensors.rst if adding a method, +# or module-specific doc bindings (like torch/linalg/__init__.py) if +# adding an alias in a namespace.) +# 5) Update torch/overrides.py consistent with the original function. +# 6) Update the alias_map in torch/csrc/jit/passes/normalize_ops.cpp. 
+# 7) Add entries to test/test_op_aliases.py's "alias_infos" +# +# See torch.absolute, an alias for torch.abs, as an example. +# Absolute, alias for abs - func: absolute(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: abs - CUDA: abs - func: absolute_(Tensor(a!) self) -> Tensor(a!) - variants: function, method - dispatch: - CPU: abs_ - CUDA: abs_ + use_c10_dispatcher: full + variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: abs_out - CUDA: abs_out - func: angle(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -251,6 +281,17 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function @@ -264,15 +305,35 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out + +- func: _conj(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function - func: acos(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: acos_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out + +# arccos, alias of acos +- func: arccos(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arccos_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full + variants: function, method + +- func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor use_c10_dispatcher: full @@ -288,48 +349,69 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: add - CUDA: add - SparseCPU: add_sparse - SparseCUDA: add_sparse + CPU, CUDA: add + SparseCPU, SparseCUDA: add_sparse MkldnnCPU: mkldnn_add - Vulkan: vulkan_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: add_ - CUDA: add_ - SparseCPU: add_sparse_ - SparseCUDA: add_sparse_ + CPU, CUDA: add_ + SparseCPU, SparseCUDA: add_sparse_ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: add_out - CUDA: add_out + CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda MkldnnCPU: mkldnn_add_out +- func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU: add_relu + +- func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: function + dispatch: + CPU: add_relu_ + +- func: _add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CPU: add_relu_out + # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv_ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda @@ -339,6 +421,7 @@ variants: function, method - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -354,8 +437,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -369,8 +456,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -378,10 +469,13 @@ - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!) - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -402,60 +496,89 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: argmax - CUDA: argmax + CPU, CUDA: argmax - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: argmin - CUDA: argmin + CPU, CUDA: argmin - func: acosh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: acosh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True + dispatch: + CPU, CUDA: acosh_out + +# arccosh, alias for acosh +- func: arccosh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arccosh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: asinh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: asinh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- supports_named_tensor: True + dispatch: + CPU, CUDA: asinh_out + +# arcsinh, alias for asinh +- func: arcsinh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arcsinh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atanh(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True variants: function, method - func: atanh_(Tensor(a!) self) -> Tensor(a!) - supports_named_tensor: True + use_c10_dispatcher: full variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True + dispatch: + CPU, CUDA: atanh_out + +# arctanh, alias for atanh +- func: arctanh(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arctanh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) use_c10_dispatcher: full variants: function, method dispatch: - CPU: as_strided_tensorimpl - CUDA: as_strided_tensorimpl - QuantizedCPU: as_strided_qtensorimpl - QuantizedCUDA: as_strided_qtensorimpl + CPU, CUDA: as_strided_tensorimpl + QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl device_guard: False - func: as_strided_(Tensor(a!) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a!) @@ -467,18 +590,73 @@ variants: function, method - func: asin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse + +# arcsin, alias of asin +- func: arcsin(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arcsin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: atan(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: atan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out + +# arctan, alias of atan +- func: arctan(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: arctan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + +- func: atleast_1d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + +- func: atleast_2d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + variants: function + +- func: atleast_3d(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full @@ -488,12 +666,14 @@ CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) 
self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -503,19 +683,24 @@ CUDA: baddbmm_out_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full - func: quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor - requires_tensor: True + use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_batch_norm - func: _batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int) + use_c10_dispatcher: full - func: _batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor @@ -523,12 +708,18 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) 
variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -537,8 +728,10 @@ variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor + use_c10_dispatcher: full - func: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn variants: function dispatch: @@ -553,6 +746,7 @@ CUDA: binary_cross_entropy_out_cuda - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn variants: function dispatch: @@ -567,12 +761,15 @@ CUDA: binary_cross_entropy_backward_out_cuda - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full variants: function - func: binary_cross_entropy_with_logits_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full variants: function - func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU: _bincount_cpu @@ -583,64 +780,66 @@ variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: bitwise_not_out - CUDA: bitwise_not_out + CPU, CUDA: bitwise_not_out - func: logical_not(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_not_out - CUDA: logical_not_out + CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_xor_out - CUDA: logical_xor_out + CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_and_out - CUDA: logical_and_out + CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: logical_or_out - CUDA: logical_or_out + CPU, CUDA: logical_or_out - func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: bmm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full @@ -692,17 +891,22 @@ variants: function, method - func: ceil_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ceil_out - CUDA: ceil_out + CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor use_c10_dispatcher: full variants: function +- func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method @@ -712,63 +916,108 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: clamp - CUDA: clamp - QuantizedCPU: quantized_clamp - Vulkan: vulkan_clamp + CPU, CUDA: clamp + QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full variants: function, method - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full variants: function, method - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: clamp_min_out + +# clip is an alias for clamp +- func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) + variants: function, method + +- func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) - func: cudnn_is_acceptable(Tensor self) -> bool use_c10_dispatcher: full device_guard: False +- func: complex(Tensor real, Tensor imag) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out + +- func: polar(Tensor abs, Tensor angle) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out + - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full variants: function -- func: contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor +- func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) + use_c10_dispatcher: full variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor + use_c10_dispatcher: full - func: convolution_overrideable(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor + use_c10_dispatcher: full - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) use_c10_dispatcher: full -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor + use_c10_dispatcher: full + +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full - func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding) -> Tensor + use_c10_dispatcher: full -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor use_c10_dispatcher: full @@ -778,13 +1027,16 @@ # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor + use_c10_dispatcher: full - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: method device_guard: False @@ -797,22 +1049,38 @@ variants: function, method - func: cos_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: cosh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full +- func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: count_nonzero + +- func: count_nonzero(Tensor self, int? dim=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid use_c10_dispatcher: full dispatch: @@ -825,60 +1093,74 @@ CUDA: cudnn_affine_grid_generator_backward - func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: cudnn_batch_norm # NB: You can only use this if you used cudnn_batch_norm training=True - func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: cudnn_batch_norm_backward - func: cudnn_convolution.deprecated(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated -- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full + dispatch: + CUDA: cudnn_convolution_deprecated2 + +- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution -- func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input -- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor) +- func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward -- func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, 
bool deterministic) -> Tensor +- func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight - func: cudnn_convolution_transpose.deprecated(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated -- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full + dispatch: + CUDA: cudnn_convolution_transpose_deprecated2 + +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards -- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[2] output_mask) -> (Tensor, Tensor) +- func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) use_c10_dispatcher: 
full dispatch: CUDA: cudnn_convolution_transpose_backward -- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input -- func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight @@ -928,7 +1210,13 @@ CPU: cummin_helper_cpu CUDA: cummin_helper_cuda +- func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -938,7 +1226,13 @@ - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: cumsum.out(Tensor self, int dim, *, ScalarType? 
dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -958,7 +1252,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -967,10 +1261,6 @@ CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu -- func: det(Tensor self) -> Tensor - use_c10_dispatcher: full - variants: function, method - - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor use_c10_dispatcher: full variants: function, method @@ -986,32 +1276,33 @@ - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) variants: function, method +- func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: div - CUDA: div - SparseCPU: div_sparse - SparseCUDA: div_sparse + CPU, CUDA: div + SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: div_ - CUDA: div_ - SparseCPU: div_sparse_ - SparseCUDA: div_sparse_ + CPU, CUDA: div_ + SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: div_out - CUDA: div_out - SparseCPU: div_out_sparse_zerodim - SparseCUDA: div_out_sparse_zerodim + CPU, CUDA: div_out + SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor @@ -1019,24 +1310,72 @@ variants: function, method - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method -- func: dot(Tensor self, Tensor tensor) -> Tensor +# divide, alias for div +- func: divide.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: legacy::cpu::_th_dot - CUDA: legacy::cuda::_th_dot - -- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) -- func: einsum(str equation, Tensor[] tensors) -> Tensor +- func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full + variants: method -- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor - use_c10_dispatcher: full +- func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -- func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor +- func: divide.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + + # true_divide, an alias for div +- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: dot(Tensor self, Tensor tensor) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU: dot + CUDA: dot_cuda + +- func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) + +- func: vdot(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU: vdot + CUDA: vdot_cuda + +- func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: einsum(str equation, Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor + use_c10_dispatcher: full + +- func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor use_c10_dispatcher: full - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor @@ -1046,6 +1385,7 @@ CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ @@ -1062,18 +1402,30 @@ # applying indices = indices.contiguous(). # The backward functions apply a check that these input tensors are contiguous. + +- func: _embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? 
per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full + dispatch: + CPU: _embedding_bag_forward_only_cpu + CUDA: _embedding_bag_forward_only_cuda + - func: embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: _embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: _embedding_bag_cpu CUDA: _embedding_bag_cuda - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full - func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full - func: _embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, int num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights) -> Tensor + use_c10_dispatcher: full dispatch: CPU: _embedding_bag_dense_backward_cpu CUDA: _embedding_bag_dense_backward_cuda @@ -1085,118 +1437,150 @@ CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + #use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + #use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda MkldnnCPU: empty_mkldnn - SparseCPU: empty_sparse - SparseCUDA: empty_sparse - Vulkan: empty_vulkan + SparseCPU, SparseCUDA: empty_sparse - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + #use_c10_dispatcher: full variants: method - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method - func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full variants: method # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_affine_quantized_other_backends_stub - QuantizedCPU: empty_affine_quantized - QuantizedCUDA: empty_affine_quantized + QuantizedCPU, QuantizedCUDA: empty_affine_quantized # it's a factory function receiving a tensor argument, thus overriding explicitly # other overrides are to provide a more helpful error message that dtype is required - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor + use_c10_dispatcher: full category_override: factory dispatch: CPU: empty_per_channel_affine_quantized_other_backends_stub - QuantizedCPU: empty_per_channel_affine_quantized_cpu + QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: method device_guard: False + dispatch: + CPU: resize_ + CUDA: resize_cuda_ + QuantizedCPU: quantized_resize_cpu_ - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - QuantizedCPU: empty_quantized - QuantizedCUDA: empty_quantized + QuantizedCPU, QuantizedCUDA: empty_quantized - func: empty.out(int[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) device_guard: False - func: empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda - Vulkan: empty_strided_vulkan - func: erf(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: erf_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: erfc_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: exp_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out + +- func: exp2(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: exp2_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: expm1_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: expm1_out - CUDA: expm1_out + CPU, CUDA: expm1_out - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False -- func: expand_as(Tensor self, Tensor other) -> Tensor +- func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -1208,23 +1592,31 @@ CPU: eye_out_cpu CUDA: eye_out_cuda -- func: flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> Tensor +- func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) use_c10_dispatcher: full variants: function, method -- func: flatten.named_out_dim(Tensor self, int start_dim, int end_dim, Dimname out_dim) -> Tensor +- func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) variants: function, method -- func: flatten.using_names(Tensor self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor +- func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) variants: function, method -- func: flatten.DimnameList(Tensor self, Dimname[] dims, Dimname out_dim) -> Tensor +- func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) variants: function, method +- func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a) + variants: method + +- func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) + variants: method + - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: floor(Tensor self) -> Tensor @@ -1232,42 +1624,38 @@ variants: function, method - func: floor_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: floor_out - CUDA: floor_out + CPU, CUDA: floor_out - func: floor_divide(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: floor_divide - CUDA: floor_divide - SparseCPU: floor_divide_sparse - SparseCUDA: floor_divide_sparse + CPU, CUDA: floor_divide + SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: floor_divide_ - CUDA: floor_divide_ - SparseCPU: floor_divide_sparse_ - SparseCUDA: floor_divide_sparse_ + CPU, CUDA: floor_divide_ + SparseCPU, SparseCUDA: floor_divide_sparse_ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: floor_divide_out - CUDA: floor_divide_out - SparseCPU: floor_divide_out_sparse_zerodim - SparseCUDA: floor_divide_out_sparse_zerodim + CPU, CUDA: floor_divide_out + SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: function, method - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: frac(Tensor self) -> Tensor @@ -1275,29 +1663,63 @@ variants: function, method - func: frac_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False - func: full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) 
- func: full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: from_file +- func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out + +- func: gcd(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out + +- func: lcm(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + # NOTE [ grid_sampler Native Functions ] # `grid_sampler` does all the shape checking and then dispatches to one of # `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which # has the corresponding backward defined as native functions as well. Therefore, # in these functions and their backwards, no more shape checking is done. # +# There is also _grid_sampler_2d_backward_cpu_fallback which is an +# implementation detail of grid_sampler_2d and is only exposed here for testing +# purposes. +# # Additionally, arguments `padding_mode` and `interpolation_mode` are cast to # enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in # `interpolation_mode` because it only supports Bilinear interpolation mode. 
@@ -1318,6 +1740,13 @@ CPU: grid_sampler_2d_backward_cpu CUDA: grid_sampler_2d_backward_cuda +# See NOTE [ grid_sample CPU fallback ] +- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor + use_c10_dispatcher: full + +- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) + use_c10_dispatcher: full + - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor use_c10_dispatcher: full dispatch: @@ -1331,43 +1760,48 @@ CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor +- func: kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: full -- func: ger(Tensor self, Tensor vec2) -> Tensor +- func: kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: full - variants: function, method -- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) +- func: kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full + +- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor + use_c10_dispatcher: full - func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor + use_c10_dispatcher: full - func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, int N, int C, int HxW, int group, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: - CPU: native_group_norm - CUDA: native_group_norm + CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CPU: native_group_norm_backward - CUDA: native_group_norm_backward - -# FFT - -- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full - variants: function, method + dispatch: + CPU, CUDA: native_group_norm_backward - func: ifft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full @@ -1384,6 +1818,10 @@ - func: _fft_with_size(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, bool normalized, bool onesided, int[] output_sizes) -> Tensor use_c10_dispatcher: full variants: function + +- func: _fft_with_size.norm_modes(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, int normalization, bool onesided, int[] output_sizes) -> Tensor + use_c10_dispatcher: full + variants: function dispatch: CPU: _fft_mkl CUDA: _fft_cufft @@ -1402,12 +1840,15 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) # - Tensor Tensor::index(std::initializer_list indices) - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor @@ -1433,8 +1874,11 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? 
running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor + use_c10_dispatcher: full variants: function - func: inverse(Tensor self) -> Tensor @@ -1459,10 +1903,8 @@ variants: function, method device_guard: False dispatch: - CPU: isnan - CUDA: isnan - SparseCPU: isnan_sparse - SparseCUDA: isnan_sparse + CPU, CUDA: isnan + SparseCPU, SparseCUDA: isnan_sparse - func: is_distributed(Tensor self) -> bool use_c10_dispatcher: full @@ -1479,6 +1921,10 @@ variants: function, method device_guard: False +- func: isreal(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + - func: is_nonzero(Tensor self) -> bool use_c10_dispatcher: full variants: function, method @@ -1518,21 +1964,26 @@ - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor + use_c10_dispatcher: full - func: native_layer_norm(Tensor input, Tensor? weight, Tensor? bias, int M, int N, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: layer_norm_cpu CUDA: layer_norm_cuda - func: native_layer_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int M, int N, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + use_c10_dispatcher: full python_module: nn - func: mkldnn_linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: MkldnnCPU: mkldnn_linear @@ -1561,9 +2012,10 @@ - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor use_c10_dispatcher: full -- func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: linspace.out(Scalar start, Scalar end, int steps=100, *, Tensor(a!) out) -> Tensor(a!) +- func: linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: linspace_cpu_out CUDA: linspace_cuda_out @@ -1573,63 +2025,64 @@ variants: function, method - func: log_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log_out - CUDA: log_out + CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log10_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log10_out - CUDA: log10_out + CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log1p_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method dispatch: - CPU: log1p_ - CUDA: log1p_ - SparseCPU: log1p_sparse_ - SparseCUDA: log1p_sparse_ + CPU, CUDA: log1p_ + SparseCPU, SparseCUDA: log1p_sparse_ - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: log1p_out - CUDA: log1p_out - SparseCPU: log1p_out_sparse - SparseCUDA: log1p_out_sparse + CPU, CUDA: log1p_out + SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: log2_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: log2_out - CUDA: log2_out + CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1639,15 +2092,17 @@ use_c10_dispatcher: full variants: function, method -- func: logspace(Scalar start, Scalar end, int steps=100, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full -- func: logspace.out(Scalar start, Scalar end, int steps=100, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +- func: logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: logspace_cpu_out CUDA: logspace_cuda_out # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor @@ -1677,6 +2132,7 @@ CUDA: _logcumsumexp_out_cuda - func: logcumsumexp(Tensor self, int dim) -> Tensor + use_c10_dispatcher: full variants: function, method - func: logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) @@ -1716,24 +2172,61 @@ use_c10_dispatcher: full variants: function, method -- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) +- func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp -- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) +- func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor + use_c10_dispatcher: full + +- func: _aminmax(Tensor self) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _aminmax_all + +- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _aminmax + +- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor + dispatch: + CPU, CUDA: _compute_linear_combination -- func: max_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor +- func: _compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: _compute_linear_combination_out + +- func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method +- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) 
indices) + dispatch: + CPU, CUDA: max_out + - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) -- func: max_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor +- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + +- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + use_c10_dispatcher: full variants: function, method +- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amax_out + # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -1746,13 +2239,21 @@ - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full - requires_tensor: True dispatch: MkldnnCPU: mkldnn_max_pool2d +- func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + MkldnnCPU: mkldnn_max_pool3d + +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + QuantizedCPU: quantized_max_pool1d + - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full - requires_tensor: True 
dispatch: QuantizedCPU: quantized_max_pool2d @@ -1762,25 +2263,23 @@ # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: mean_cpu_gpu - CUDA: mean_cpu_gpu - QuantizedCPU: quantized_mean_cpu + CPU, CUDA: mean_cpu_gpu + QuantizedCPU: mean_quantized_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: mean_cpu_gpu - CUDA: mean_cpu_gpu - QuantizedCPU: quantized_mean_cpu - Vulkan: mean_vulkan + CPU, CUDA: mean_cpu_gpu + QuantizedCPU: mean_quantized_cpu - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: mean_out_cpu_gpu - CUDA: mean_out_cpu_gpu - QuantizedCPU: quantized_mean_out_cpu + CPU, CUDA: mean_out_cpu_gpu + QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -1803,20 +2302,24 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) - -- func: min_values(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full - variants: function, method + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) -- func: min_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor +- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor + use_c10_dispatcher: full variants: function, method +- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out + - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + use_c10_dispatcher: full - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor use_c10_dispatcher: full @@ -1828,14 +2331,17 @@ use_c10_dispatcher: full - func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_batch_norm - func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_batch_norm_backward - func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_convolution @@ -1860,6 +2366,7 @@ CUDA: miopen_convolution_backward_weight - func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose @@ -1881,6 +2388,7 @@ CUDA: miopen_convolution_transpose_backward_weight - func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution @@ -1900,10 +2408,12 @@ CUDA: miopen_depthwise_convolution_backward_weight - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: miopen_rnn - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[]) + use_c10_dispatcher: full dispatch: CUDA: miopen_rnn_backward @@ -1913,15 +2423,13 @@ dispatch: CPU: mm_cpu CUDA: mm_cuda - SparseCPU: _sparse_mm - SparseCUDA: _sparse_mm + SparseCPU, SparseCUDA: _sparse_mm - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: mm_cpu_out CUDA: mm_out_cuda - SparseCPU: _sparse_mm_out - SparseCUDA: _sparse_mm_out + SparseCPU, SparseCUDA: _sparse_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor use_c10_dispatcher: full @@ -1929,6 +2437,8 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: mode - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1941,25 +2451,21 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: mul - CUDA: mul - SparseCPU: mul_sparse - SparseCUDA: mul_sparse + CPU, CUDA: mul + SparseCPU, SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: mul_ - CUDA: mul_ - SparseCPU: mul_sparse_ - SparseCUDA: mul_sparse_ + CPU, CUDA: mul_ + SparseCPU, SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: mul_out - CUDA: mul_out + CPU, CUDA: mul_out SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -1970,16 +2476,34 @@ variants: function, method - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# multiply, alias for mul +- func: multiply.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: multiply.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: multiply_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: mv - CUDA: mv - SparseCPU: mv_sparse - SparseCUDA: mv_sparse + CPU, CUDA: mv + SparseCPU, SparseCUDA: mv_sparse - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!) @@ -1988,16 +2512,15 @@ variants: function, method - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor use_c10_dispatcher: full variants: method dispatch: - CPU: narrow_copy_dense - CUDA: narrow_copy_dense - SparseCPU: narrow_copy_sparse - SparseCUDA: narrow_copy_sparse + CPU, CUDA: narrow_copy_dense + SparseCPU, SparseCUDA: narrow_copy_sparse - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) use_c10_dispatcher: full @@ -2010,6 +2533,7 @@ device_guard: False - func: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda @@ -2025,6 +2549,7 @@ CUDA: batch_norm_stats_cuda - func: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: batch_norm_elemt_cuda @@ -2034,27 +2559,33 @@ # for backward compatibility - func: batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_gather_stats_cuda - func: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? 
running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_gather_stats_with_counts_cuda - func: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: batch_norm_backward_reduce_cuda - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu) -> Tensor + use_c10_dispatcher: full dispatch: CUDA: batch_norm_backward_elemt_cuda - func: batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CPU: batch_norm_update_stats_cpu CUDA: batch_norm_update_stats_cuda @@ -2066,6 +2597,7 @@ use_c10_dispatcher: full - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor + use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) @@ -2084,10 +2616,12 @@ device_guard: False - func: ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: ones.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor use_c10_dispatcher: full @@ -2100,18 +2634,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2121,6 +2663,14 @@ use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + +- func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) + use_c10_dispatcher: full + variants: function, method + # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. 
# @@ -2139,13 +2689,13 @@ use_c10_dispatcher: full dispatch: CPU: channel_shuffle - QuantizedCPU: quantized_channel_shuffle + QuantizedCPU: channel_shuffle_quantized_cpu - func: is_pinned(Tensor self) -> bool use_c10_dispatcher: full variants: method -- func: pin_memory(Tensor self) -> Tensor +- func: pin_memory(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method @@ -2160,28 +2710,25 @@ - func: rad2deg(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True - func: deg2rad(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - supports_named_tensor: True - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - supports_named_tensor: True - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2190,6 +2737,7 @@ device_guard: False - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: rand.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2198,12 +2746,15 @@ - func: rand.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint(int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.generator(int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: randint.low(int low, int high, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randint.low_generator(int low, int high, int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2216,10 +2767,13 @@ - func: randint.low_generator_out(int low, int high, int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randn(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randn.generator(int[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2234,8 +2788,10 @@ - func: randn.generator_out(int[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!) - func: randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2247,8 +2803,10 @@ CUDA: randperm_out_cuda - func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -2260,21 +2818,39 @@ variants: function, method - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: neg_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: neg_out - CUDA: neg_out + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse + +# Alias for neg +- func: negative(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: negative_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2295,7 +2871,7 @@ use_c10_dispatcher: full variants: function, method -- func: reshape(Tensor self, int[] shape) -> Tensor +- func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) use_c10_dispatcher: full variants: function, method device_guard: False @@ -2303,11 +2879,10 @@ - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_reshape -- func: reshape_as(Tensor self, Tensor other) -> Tensor +- func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method device_guard: False @@ -2317,6 +2892,7 @@ variants: function, method - func: round_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -2332,18 +2908,17 @@ use_c10_dispatcher: full variants: function, method dispatch: - CPU: relu - CUDA: relu + CPU, CUDA: relu MkldnnCPU: mkldnn_relu - QuantizedCPU: quantized_relu + QuantizedCPU: relu_quantized_cpu - func: relu_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function, method dispatch: - CPU: relu_ - CUDA: relu_ + CPU, CUDA: relu_ MkldnnCPU: mkldnn_relu_ - QuantizedCPU: quantized_relu_ + QuantizedCPU: relu_quantized_cpu_ - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full @@ -2373,25 +2948,35 @@ CPU: gelu_backward_cpu CUDA: gelu_backward_cuda +- func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function + python_module: nn + device_guard: False + - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: rsqrt_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: rsqrt_out - CUDA: rsqrt_out + CPU, CUDA: rsqrt_out - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method @@ -2402,54 +2987,98 @@ variants: function, method device_guard: False +- func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full - func: selu_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor use_c10_dispatcher: full - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) + use_c10_dispatcher: full + +- func: silu(Tensor self) -> Tensor + use_c10_dispatcher: full + python_module: nn + +- func: silu_(Tensor(a!) self) -> Tensor(a!) 
+ use_c10_dispatcher: full + python_module: nn + +- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: silu_out + +- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor + use_c10_dispatcher: full + python_module: nn - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: sigmoid - CUDA: sigmoid - QuantizedCPU: quantized_sigmoid + CPU, CUDA: sigmoid + QuantizedCPU: sigmoid_quantized_cpu MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method dispatch: - CPU: sigmoid_ - CUDA: sigmoid_ + CPU, CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out + +- func: logit(Tensor self, float? eps=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: logit + +- func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: logit_ + +- func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sin_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sin_out - CUDA: sin_out + CPU, CUDA: sin_out - func: sinh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sinh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. 
@@ -2462,16 +3091,15 @@ # to false to make such changes explicitly illegal, in order to prevent users from # changing metadata of the detached tensor and expecting the original tensor to also # be updated. -- func: detach(Tensor self) -> Tensor +- func: detach(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full - manual_kernel_registration: True variants: function, method # Like `detach()`, but modifies this `Variable` in-place. This method may # only be called on non-view `Variable`s. You can use `is_view()` to check # this. If this `Variable` is a view, throws an `std::runtime_error()`. - func: detach_(Tensor(a!) self) -> Tensor(a!) - manual_kernel_registration: True + use_c10_dispatcher: full variants: function, method - func: size.int(Tensor self, int dim) -> int @@ -2488,6 +3116,11 @@ variants: function, method device_guard: False +- func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) use_c10_dispatcher: full variants: function, method @@ -2498,6 +3131,7 @@ # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor @@ -2516,12 +3150,22 @@ CPU: softmax_backward_cpu CUDA: softmax_backward_cuda +- func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method device_guard: False -- func: split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] +- func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] + use_c10_dispatcher: full + variants: function, method + device_guard: False + +- func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full variants: function, method device_guard: False @@ -2541,10 +3185,12 @@ device_guard: False - func: squeeze_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False @@ -2568,14 +3214,31 @@ - func: stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) +- func: hstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: vstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + +- func: dstack(Tensor[] tensors) -> Tensor + use_c10_dispatcher: full + +- func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) + # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at # `torch.functional.py`. They shall be moved here once we have mapping between # Python strings and C++ Enum in codegen. -- func: stft(Tensor self, int n_fft, int? hop_length=None, int? 
win_length=None, Tensor? window=None, bool normalized=False, bool onesided=True) -> Tensor +- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + use_c10_dispatcher: full variants: function, method -- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool onesided=True, int? length=None) -> Tensor +- func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor + use_c10_dispatcher: full variants: function, method - func: stride.int(Tensor self, int dim) -> int @@ -2588,18 +3251,42 @@ device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: nansum + +- func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: nansum + +- func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out + - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full variants: method @@ -2610,37 +3297,51 @@ variants: function, method - func: sqrt_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: square_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -2648,12 +3349,20 @@ - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!) - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -2666,6 +3375,7 @@ variants: function, method - func: t_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full device_guard: False variants: method @@ -2674,22 +3384,27 @@ variants: function, method - func: tan_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: tanh - CUDA: tanh - QuantizedCPU: quantized_tanh + CPU, CUDA: tanh + QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -2702,9 +3417,10 @@ dispatch: CPU: threshold CUDA: threshold_cuda - QuantizedCPU: quantized_threshold + QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -2734,17 +3450,17 @@ - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) + use_c10_dispatcher: full device_guard: False - requires_tensor: True dispatch: MkldnnCPU: mkldnn_transpose_ @@ -2775,7 +3491,7 @@ CPU: roll_cpu CUDA: roll_cuda -# default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args +# default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor use_c10_dispatcher: full @@ -2793,48 +3509,28 @@ - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor use_c10_dispatcher: full -- func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor +- func: trunc(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: true_divide - CUDA: true_divide - SparseCPU: true_divide_sparse - SparseCUDA: true_divide_sparse - -- func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - dispatch: - CPU: true_divide_ - CUDA: true_divide_ - SparseCPU: true_divide_sparse_ - SparseCUDA: true_divide_sparse_ - -- func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: true_divide_out - CUDA: true_divide_out - SparseCPU: true_divide_out_sparse_zerodim - SparseCUDA: true_divide_out_sparse_zerodim -- func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor +- func: trunc_(Tensor(a!) 
self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method -- func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method +- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out -- func: trunc(Tensor self) -> Tensor +# Alias for trunc +- func: fix(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method -- func: trunc_(Tensor(a!) self) -> Tensor(a!) +- func: fix_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: function, method -- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: trunc_out - CUDA: trunc_out +- func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2892,6 +3588,7 @@ device_guard: False - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False @@ -2901,12 +3598,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -2916,15 +3619,19 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function -- func: view_as(Tensor self, Tensor other) -> Tensor +- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) use_c10_dispatcher: full variants: method device_guard: False @@ -2936,6 +3643,18 @@ use_c10_dispatcher: full variants: function, method +- func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: function + - func: where(Tensor condition) -> Tensor[] use_c10_dispatcher: full variants: function @@ -2943,6 +3662,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -2974,10 +3695,12 @@ device_guard: False - func: zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: zeros.out(int[] size, *, Tensor(a!) out) -> Tensor(a!) - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full @@ -3020,27 +3743,34 @@ - func: native_norm(Tensor self, Scalar p=2) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: norm_sparse - SparseCUDA: norm_sparse + SparseCPU, SparseCUDA: norm_sparse + +- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor + use_c10_dispatcher: full + dispatch: + SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor use_c10_dispatcher: full - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -3050,12 +3780,16 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor + use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -3065,12 +3799,16 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor + use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor @@ -3078,6 +3816,7 @@ variants: function, method - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function, method - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor @@ -3085,8 +3824,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3124,68 +3867,44 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: - CPU: clone - CUDA: clone - SparseCPU: clone_sparse - SparseCUDA: clone_sparse + CPU, CUDA: clone + SparseCPU, SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone - QuantizedCPU: quantized_clone - QuantizedCUDA: quantized_clone + QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - manual_kernel_registration: True - variants: function, method - -- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: pow_out - CUDA: pow_out - SparseCPU: pow_out_sparse_scalar - SparseCUDA: pow_out_sparse_scalar - -- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full variants: function, method - dispatch: - CPU: pow - CUDA: pow - SparseCPU: pow_sparse_scalar - SparseCUDA: pow_sparse_scalar - func: zero_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method, function dispatch: - CPU: zero_ - CUDA: zero_ - SparseCPU: zero_sparse_ - SparseCUDA: zero_sparse_ + CPU, CUDA: zero_ + SparseCPU, SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sub_out - CUDA: sub_out - SparseCPU: sub_out_sparse - SparseCUDA: sub_out_sparse + CPU, CUDA: sub_out + SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - CPU: sub - CUDA: sub - SparseCPU: sub_sparse - SparseCUDA: sub_sparse + CPU, CUDA: sub + SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - CPU: sub_ - CUDA: sub_ - SparseCPU: sub_sparse_ - SparseCUDA: sub_sparse_ + CPU, CUDA: sub_ + SparseCPU, SparseCUDA: sub_sparse_ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor @@ -3193,11 +3912,45 @@ variants: function, method - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# subtract, alias for sub +- func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + +- func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# For C++ only, until we have conversion from C++ numbers to Tensor +- func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub + +- func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: heaviside_out + +- func: heaviside(Tensor self, Tensor values) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
+ variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor @@ -3224,12 +3977,12 @@ CUDA: addmm_cuda SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda - Vulkan: vulkan_addmm - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: legacy::cpu::_th_addmm_ + CPU: addmm_cpu_ CUDA: addmm__cuda # Warning! For whatever reason, the inplace sparse addmm is NON # broadcasting @@ -3255,12 +4008,6 @@ # using **Tensor** type, and thus lose autograd tracking on the actual method # they dispatch to, e.g., `sparse_coo_tensor_with_dims_and_tensors`. # -# The actual ctors `sparse_coo_tensor_with_dims` and `sparse_coo_tensor_with_dims_and_tensors`, -# on the other hand, need to create `SparseTensorImpl` and know nothing about -# how `VariableType`s work. So they need to be dispatched using Tensor types. -# We thus put `requires_tensor=True` to ensure that `VariableType` will unwrap -# the given variables and call with the Tensor type. -# # # Sparse Methods API Design # ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -3353,39 +4100,42 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. -- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? 
dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full + +- func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () + use_c10_dispatcher: full -- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: - SparseCPU: new_with_dims_sparse - SparseCUDA: new_with_dims_sparse - requires_tensor: True + SparseCPU, SparseCUDA: new_with_dims_sparse -- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor +- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + use_c10_dispatcher: full dispatch: - SparseCPU: new_with_dims_and_tensor_sparse - SparseCUDA: new_with_dims_and_tensor_sparse - requires_tensor: True + SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_resize_ - SparseCUDA: sparse_resize_ - requires_tensor: True + SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_resize_and_clear_ - SparseCUDA: sparse_resize_and_clear_ - requires_tensor: True + SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full @@ -3393,16 +4143,13 @@ dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - requires_tensor: True - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_to_dense - SparseCUDA: sparse_to_dense + SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - requires_tensor: True - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor use_c10_dispatcher: full @@ -3411,9 +4158,7 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_dim_sparse - SparseCUDA: sparse_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False # legacy method @@ -3421,18 +4166,14 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: sparse_dim_sparse - SparseCUDA: sparse_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method dispatch: - SparseCPU: dense_dim_sparse - SparseCUDA: dense_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False # legacy method @@ -3440,18 +4181,14 @@ use_c10_dispatcher: full variants: method dispatch: - SparseCPU: dense_dim_sparse - SparseCUDA: dense_dim_sparse - requires_tensor: True + SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _nnz_sparse - SparseCUDA: _nnz_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor @@ -3460,83 +4197,68 @@ dispatch: SparseCPU: coalesce_sparse_cpu 
SparseCUDA: coalesce_sparse_cuda - requires_tensor: True - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method dispatch: - SparseCPU: is_coalesced_sparse - SparseCUDA: is_coalesced_sparse - requires_tensor: True + SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _indices_sparse - SparseCUDA: _indices_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _values_sparse - SparseCUDA: _values_sparse - requires_tensor: True + SparseCPU, SparseCUDA: _values_sparse device_guard: False # This method doesn't do any check but only directly sets the flag. So it can be # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - SparseCPU: _coalesced_sparse_ - SparseCUDA: _coalesced_sparse_ - requires_tensor: True + SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: indices_sparse - SparseCUDA: indices_sparse - requires_tensor: True + SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: method dispatch: - SparseCPU: values_sparse - SparseCUDA: values_sparse - requires_tensor: True + SparseCPU, SparseCUDA: values_sparse device_guard: False - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: SparseCPU: hspmm_out_sparse_cpu SparseCUDA: hspmm_out_sparse_cuda - requires_tensor: True - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - requires_tensor: True - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + use_c10_dispatcher: full variants: function dispatch: - SparseCPU: copy_sparse_ - SparseCUDA: copy_sparse_ - requires_tensor: True + SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] use_c10_dispatcher: full @@ -3549,15 +4271,13 @@ use_c10_dispatcher: full variants: method dispatch: - CPU: dense_to_sparse - CUDA: dense_to_sparse + CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor use_c10_dispatcher: full variants: method dispatch: - CPU: dense_to_sparse - CUDA: dense_to_sparse + CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3572,21 +4292,30 @@ dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight +- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor + use_c10_dispatcher: full + variants: function + python_module: nn + dispatch: + MkldnnCPU: mkldnn_reorder_conv3d_weight + - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - CPU: quantize_per_tensor - CUDA: quantize_per_tensor + CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] + use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor + 
use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu @@ -3595,53 +4324,50 @@ use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: dequantize_quant - QuantizedCUDA: dequantize_quant + QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] use_c10_dispatcher: full variants: function dispatch: - QuantizedCPU: dequantize_tensors_quant + QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_scale_quant - QuantizedCUDA: q_scale_quant + QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_zero_point_quant - QuantizedCUDA: q_zero_point_quant + QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_scales_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_zero_points_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: q_per_channel_axis_quant + QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method dispatch: - QuantizedCPU: int_repr_quant_cpu - QuantizedCUDA: int_repr_quant_cuda + QuantizedCPU: int_repr_quantized_cpu + QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor use_c10_dispatcher: full @@ -3658,45 +4384,80 @@ use_c10_dispatcher: full variants: method dispatch: - 
QuantizedCPU: qscheme_quant - QuantizedCUDA: qscheme_quant + QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function +- func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine + +- func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full + variants: function + - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function -- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) +- func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine -# to(Device) must not exist because all constructors of Device also works for -# TensorOptions. Otherwise, an ambiguity error is thrown. -# See NOTE [ TensorOptions Constructors ]. 
-- func: to.dtype_layout(Tensor self, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - variants: method - device_guard: False +- func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full + variants: function -- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - variants: method - device_guard: False +- func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) + use_c10_dispatcher: full + variants: function + +- func: _saturate_weight_to_fp16(Tensor weight) -> Tensor + use_c10_dispatcher: full + variants: function + +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (float, float) + use_c10_dispatcher: full + variants: function + +# to(Device) must not exist because all constructors of Device also works for +# TensorOptions. Otherwise, an ambiguity error is thrown. +# See NOTE [ TensorOptions Constructors ]. +- func: to.dtype_layout(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full + variants: method + device_guard: False + +- func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full + variants: method + device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + use_c10_dispatcher: full variants: method device_guard: False @@ -3716,20 +4477,26 @@ variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType + use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType + use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool + use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType + use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 @@ -3742,16 +4509,20 @@ # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_lstm_cell_cuda - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_lstm_cell_backward_cuda - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full - func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? 
hidden_bias=None) -> (Tensor, Tensor) + use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_cuda @@ -3761,6 +4532,7 @@ CUDA: _thnn_fused_gru_cell_backward_cuda - func: _thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + use_c10_dispatcher: full # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3788,19 +4560,25 @@ use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + use_c10_dispatcher: full - func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full - func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full - func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + use_c10_dispatcher: full # Quantized RNN layer registration has been moved to C10 dispatch in `RNN.cpp` # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? 
dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) +# use_c10_dispatcher: full # Quantized GRU layers @@ -3839,8 +4617,7 @@ variants: method device_guard: False dispatch: - CPU: set_ - CUDA: set_ + CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method @@ -3848,17 +4625,17 @@ dispatch: CPU: set_storage_cpu_ CUDA: set_storage_cuda_ - QuantizedCPU: set_storage_quantized_ - QuantizedCUDA: set_storage_quantized_ + QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method device_guard: False dispatch: - CPU: set_tensor_ - CUDA: set_tensor_ + CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ @@ -3867,18 +4644,17 @@ - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) variants: method dispatch: - QuantizedCPU: set_quantizer_ - QuantizedCUDA: set_quantizer_ + QuantizedCPU, QuantizedCUDA: set_quantizer_ - func: is_set_to(Tensor self, Tensor tensor) -> bool use_c10_dispatcher: full variants: method device_guard: False dispatch: - CPU: is_set_to - CUDA: is_set_to + CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu @@ -3889,6 +4665,7 @@ variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu @@ -3899,6 +4676,7 @@ variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu @@ -3913,19 +4691,18 @@ variants: method device_guard: False dispatch: - CPU: view - CUDA: view + CPU, CUDA, QuantizedCPU, QuantizedCUDA: view MkldnnCPU: mkldnn_view - QuantizedCPU: view - QuantizedCUDA: view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ @@ -3939,6 +4716,7 @@ variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ @@ -3949,10 +4727,10 @@ variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: index_fill_ - CUDA: index_fill_ + CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor use_c10_dispatcher: full @@ -3971,20 +4749,20 @@ variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_ - CUDA: scatter_ + CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_fill_ - CUDA: scatter_fill_ + CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full @@ -3996,11 +4774,23 @@ - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor variants: function, method +- func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + dispatch: + CPU, CUDA: scatter_reduce_ + +- func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + dispatch: + CPU, CUDA: scatter_scalar_reduce_ + - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: scatter_add_ - CUDA: scatter_add_ + CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full @@ -4009,53 +4799,23 @@ - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - variants: method - -- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - variants: method - -- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_and_out - CUDA: bitwise_and_out + CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_and_out - CUDA: bitwise_and_out + CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4066,9 +4826,11 @@ variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4080,22 +4842,22 @@ variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_or_out - CUDA: bitwise_or_out + CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_or_out - CUDA: bitwise_or_out + CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4106,9 +4868,11 @@ variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4120,22 +4884,22 @@ variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_xor_out - CUDA: bitwise_xor_out + CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: - CPU: bitwise_xor_out - CUDA: bitwise_xor_out + CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full @@ -4146,9 +4910,11 @@ variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor @@ -4160,148 +4926,162 @@ variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __lshift__ - CUDA: __lshift__ + CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __lshift__ - CUDA: __lshift__ + CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __ilshift__ - CUDA: __ilshift__ + CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __ilshift__ - CUDA: __ilshift__ + CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __rshift__ - CUDA: __rshift__ + CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: __rshift__ - CUDA: __rshift__ + CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __irshift__ - CUDA: __irshift__ + CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: __irshift__ - CUDA: __irshift__ + CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) 
self, int diagonal=0) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: pow_ - CUDA: pow_ + CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: pow_ - CUDA: pow_ + CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: fmod_ CUDA: fmod_cuda_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: fmod_ CUDA: fmod_cuda_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: remainder_ - CUDA: remainder_ + CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: remainder_ - CUDA: remainder_ + CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) 
self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: - CPU: legacy::cpu::_th_addbmm_ + CPU: addbmm_cpu_ CUDA: addbmm__cuda - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -4317,31 +5097,48 @@ CUDA: addbmm_cuda - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -4354,11 +5151,20 @@ use_c10_dispatcher: full variants: method, function +- func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4379,11 +5185,13 @@ variants: method, function - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda @@ -4395,174 +5203,300 @@ CPU: legacy::cpu::_th_trace CUDA: trace_cuda +- func: trace_backward(Tensor grad, int[] sizes) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ne_out - CUDA: ne_out + CPU, CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ne - CUDA: ne + CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: ne_out - CUDA: ne_out + CPU, CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ne - CUDA: ne + CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu +- func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# not_equal, alias for torch.ne +- func: not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: eq_out - CUDA: eq_out + CPU, CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: eq - CUDA: eq + CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: eq_out - CUDA: eq_out + CPU, CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: eq - CUDA: eq + CPU, CUDA: eq QuantizedCPU: eq_quantized_cpu - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: ge_out - CUDA: ge_out + CPU, CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ge - CUDA: ge + CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: ge_out - CUDA: ge_out + CPU, CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: ge - CUDA: ge + CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu +- func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# greater_equal, alias for torch.ge +- func: greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU: le_out - CUDA: le_out + CPU, CUDA: le_out QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: le - CUDA: le + CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: le_out - CUDA: le_out + CPU, CUDA: le_out QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: le - CUDA: le + CPU, CUDA: le QuantizedCPU: le_quantized_cpu +- func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# less_equal, alias for torch.le +- func: less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: gt_out - CUDA: gt_out + CPU, CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: gt - CUDA: gt + CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: gt_out - CUDA: gt_out + CPU, CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: gt - CUDA: gt + CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu +- func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# greater, alias for torch.gt +- func: greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: greater.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: lt_out - CUDA: lt_out + CPU, CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: lt - CUDA: lt + CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: lt_out - CUDA: lt_out + CPU, CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: lt - CUDA: lt + CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu +- func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ use_c10_dispatcher: full + variants: method + +- func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +# less, alias for torch.lt +- func: less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Scalar(Tensor self, Scalar other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: less.Tensor(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + +- func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + use_c10_dispatcher: full + variants: method + - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: legacy::cpu::_th_take_out @@ -4575,17 +5509,22 @@ CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take -- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) +- func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + +- func: index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: index_select_out_cpu_ - CUDA: legacy::cuda::_th_index_select_out + CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ - CUDA: legacy::cuda::_th_index_select + CUDA: index_select_cuda SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4594,6 +5533,11 @@ - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor variants: method, function +- func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: masked_select_out_cpu @@ -4606,17 +5550,22 @@ CPU: masked_select_cpu CUDA: masked_select_cuda +- func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False + - func: nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: legacy::cpu::_th_nonzero_out - CUDA: legacy::cuda::_th_nonzero_out + CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero - CUDA: legacy::cuda::_th_nonzero + CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] use_c10_dispatcher: full @@ -4631,8 +5580,12 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: gather - CUDA: gather + CPU, CUDA: gather + +- func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor + use_c10_dispatcher: full + variants: function + device_guard: False - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) 
@@ -4643,15 +5596,20 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full variants: method, function - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -4839,14 +5797,12 @@ # TODO: remove dispatch section when porting TH CUDA to ATen - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: multinomial_out - CUDA: multinomial_out + CPU, CUDA: multinomial_out - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor variants: method, function dispatch: - CPU: multinomial - CUDA: multinomial + CPU, CUDA: multinomial - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -4870,16 +5826,21 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: lgamma - CUDA: lgamma + CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -4889,10 +5850,10 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: erfinv - CUDA: erfinv + CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method dispatch: CPU: _erfinv__cpu @@ -4903,27 +5864,52 @@ CPU: _erfinv_out_cpu CUDA: _erfinv_out_cuda +- func: i0(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: i0_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out + - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - func: sign_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full variants: method - func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: sign_out - CUDA: sign_out + CPU, CUDA: sign_out + +- func: signbit(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: signbit_out + CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4985,57 +5971,101 @@ CPU: fmod CUDA: fmod_cuda +- func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: hypot_out + +- func: hypot(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: hypot + +- func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + +- func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out + +- func: nextafter(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: nextafter + +- func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) + variants: method + - func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: remainder_out - CUDA: remainder_out + CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: remainder - CUDA: remainder + CPU, CUDA: remainder - func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: remainder_out - CUDA: remainder_out + CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: remainder - CUDA: remainder + CPU, CUDA: remainder -- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+- func: min(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: method, function + dispatch: + CPU, CUDA: min + QuantizedCPU: min_quantized_cpu -- func: min.other(Tensor self, Tensor other) -> Tensor +- func: max(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: max + QuantizedCPU: max_quantized_cpu -- func: min(Tensor self) -> Tensor +- func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: min - CUDA: min - QuantizedCPU: min_quant + CPU, CUDA: maximum -- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +- func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out +# binary max, alias of maximum +# NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function -- func: max(Tensor self) -> Tensor +- func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: max - CUDA: max - QuantizedCPU: max_quant + CPU, CUDA: minimum + +- func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out + +# binary min, alias for minimum +# NOTE: min is not an alias for minimum, since there is also unary min +- func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + +- func: min.other(Tensor self, Tensor other) -> Tensor + use_c10_dispatcher: full + variants: method, function - func: median(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5044,18 +6074,42 @@ CPU: median_cpu CUDA: median_cuda +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ +- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + +- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor + use_c10_dispatcher: full + variants: method, function + - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) dispatch: - CPU: legacy::cpu::_th_sort_out + CPU: sort_out_cpu CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_sort + CPU: sort_cpu CUDA: legacy::cuda::_th_sort - QuantizedCPU: sort_quant + QuantizedCPU: sort_quantized_cpu - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -5069,7 +6123,7 @@ - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor variants: method, function -- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) ->(Tensor(a!) values, Tensor(b!) 
indices) +- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) dispatch: CPU: topk_out_cpu CUDA: legacy::cuda::_th_topk_out @@ -5078,22 +6132,21 @@ use_c10_dispatcher: full variants: method, function dispatch: - CPU: topk - CUDA: topk - QuantizedCPU: quantized_topk_cpu + CPU, CUDA: topk + QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: any - CUDA: any - SparseCPU: any_sparse - SparseCUDA: any_sparse + CPU, CUDA: any + SparseCPU, SparseCUDA: any_sparse - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5112,62 +6165,82 @@ variants: method device_guard: False dispatch: - CPU: unfold - CUDA: unfold - QuantizedCPU: unfold - QuantizedCUDA: unfold + CPU, CUDA: unfold + QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor + use_c10_dispatcher: full variants: function dispatch: - CPU: unfold_backward - CUDA: unfold_backward + CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_equal - CUDA: legacy::cuda::_th_equal - QuantizedCPU: quantized_equal_cpu + CPU: cpu_equal + CUDA: cuda_equal + QuantizedCPU: equal_quantized_cpu - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: pow_out - CUDA: pow_out + CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: pow - CUDA: pow + CPU, CUDA: pow - func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) 
out) -> Tensor(a!) dispatch: - CPU: pow_out - CUDA: pow_out + CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full dispatch: - CPU: pow - CUDA: pow + CPU, CUDA: pow + +- func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: pow_out + SparseCPU, SparseCUDA: pow_out_sparse_scalar + +- func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor + use_c10_dispatcher: full + variants: function, method + dispatch: + CPU, CUDA: pow + SparseCPU, SparseCUDA: pow_sparse_scalar - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor @@ -5177,23 +6250,8 @@ use_c10_dispatcher: full variants: method, function -- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr - CUDA: addr_cuda - -- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_ - CUDA: addr__cuda - -- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_out - CUDA: addr_out_cuda - - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ @@ -5215,33 +6273,301 @@ CPU: _cumprod_cpu CUDA: _cumprod_cuda -- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) +- func: _cumprod.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _cumprod_out_cpu + CUDA: _cumprod_out_cuda + +- func: _var(Tensor self, bool unbiased=True) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: legacy::cpu::_th_var + +- func: _std(Tensor self, bool unbiased=True) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: legacy::cpu::_th_std + +- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) found_inf, Tensor inv_scale) -> () + use_c10_dispatcher: full + variants: function + dispatch: + CUDA: _amp_non_finite_check_and_unscale_cuda_ + +- func: _amp_update_scale(Tensor(a!) 
growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CUDA: _amp_update_scale_cuda + +- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + use_c10_dispatcher: full + dispatch: + CPU: _cat_cpu + CUDA: cat_cuda + QuantizedCPU: cat_quantized_cpu + +- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU: _cat_out_cpu + CUDA: cat_out_cuda + QuantizedCPU: cat_out_quantized_cpu + +- func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow + CUDA: foreach_tensor_add_scalar_kernel_cuda + +- func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalar_kernel_slow_ + CUDA: foreach_tensor_add_scalar_kernel_cuda_ + +- func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow + CUDA: foreach_tensor_sub_scalar_kernel_cuda + +- func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalar_kernel_slow_ + CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + +- func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalar_kernel_slow + CUDA: foreach_tensor_mul_scalar_kernel_cuda + +- func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: 
foreach_tensor_mul_scalar_kernel_slow_ + CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + +- func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow + CUDA: foreach_tensor_div_scalar_kernel_cuda + +- func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalar_kernel_slow_ + CUDA: foreach_tensor_div_scalar_kernel_cuda_ + +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow + CUDA: foreach_tensor_add_list_kernel_cuda + +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_list_kernel_slow_ + CUDA: foreach_tensor_add_list_kernel_cuda_ + +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow + CUDA: foreach_tensor_sub_list_kernel_cuda + +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_list_kernel_slow_ + CUDA: foreach_tensor_sub_list_kernel_cuda_ + +- func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow + CUDA: foreach_tensor_mul_list_kernel_cuda + +- func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full + device_guard: 
False + variants: function + dispatch: + CPU: foreach_tensor_mul_list_kernel_slow_ + CUDA: foreach_tensor_mul_list_kernel_cuda_ + +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow + CUDA: foreach_tensor_div_list_kernel_cuda + +- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_list_kernel_slow_ + CUDA: foreach_tensor_div_list_kernel_cuda_ + +- func: _foreach_add_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_scalar_list_(Tensor(a!)[] self, float[] 
scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + +- func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_exp_slow + CUDA: foreach_tensor_exp_cuda + +- func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_exp_slow_ + CUDA: foreach_tensor_exp_cuda_ + +- func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sqrt_slow + CUDA: foreach_tensor_sqrt_cuda + +- func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cumprod_out_cpu - CUDA: _cumprod_out_cuda + CPU: foreach_tensor_sqrt_slow_ + CUDA: foreach_tensor_sqrt_cuda_ -- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) 
found_inf, Tensor inv_scale) -> () +- func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_non_finite_check_and_unscale_cuda_ + CPU: foreach_tensor_addcdiv_slow_ + CUDA: foreach_tensor_addcdiv_cuda_ -- func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor +- func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_update_scale_cuda + CPU: foreach_tensor_addcmul_slow_ + CUDA: foreach_tensor_addcmul_cuda_ -- func: _cat(Tensor[] tensors, int dim=0) -> Tensor +- func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cat_cpu - CUDA: cat_cuda - QuantizedCPU: quantized_cat + CPU: foreach_tensor_addcdiv_slow + CUDA: foreach_tensor_addcdiv_cuda -- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) +- func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function dispatch: - CPU: _cat_out_cpu - CUDA: cat_out_cuda - QuantizedCPU: quantized_cat_out + CPU: foreach_tensor_addcmul_slow + CUDA: foreach_tensor_addcmul_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -5292,23 +6618,25 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: mse_loss_backward_out - CUDA: mse_loss_backward_out + CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: mse_loss_backward - CUDA: mse_loss_backward + CPU, CUDA: mse_loss_backward - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5320,8 +6648,7 @@ - func: l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: l1_loss_backward_out - CUDA: l1_loss_backward_out + CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full @@ -5334,6 +6661,7 @@ CUDA: legacy::cuda::_thnn_multi_margin_loss_forward_out - func: multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: multi_margin_loss_cpu @@ -5346,6 +6674,7 @@ CUDA: legacy::cuda::_thnn_multi_margin_loss_backward_out - func: multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: multi_margin_loss_cpu_backward @@ -5388,6 +6717,7 @@ python_module: nn - func: nll_loss(Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor + use_c10_dispatcher: full python_module: nn - func: nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) @@ -5397,6 +6727,7 @@ CUDA: legacy::cuda::_thnn_nll_loss_forward_out - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss_forward_cpu @@ -5409,6 +6740,7 @@ CUDA: legacy::cuda::_thnn_nll_loss_backward_out - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss_backward_cpu @@ -5418,6 +6750,7 @@ python_module: nn - func: nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor + use_c10_dispatcher: full python_module: nn - func: nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!)) @@ -5427,6 +6760,7 @@ CUDA: legacy::cuda::_thnn_nll_loss2d_forward_out - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss2d_forward_cpu @@ -5439,28 +6773,31 @@ CUDA: legacy::cuda::_thnn_nll_loss2d_backward_out - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: nll_loss2d_backward_cpu CUDA: legacy::cuda::_thnn_nll_loss2d_backward -- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_out CUDA: smooth_l1_loss_out -- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss -- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out -- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor use_c10_dispatcher: full python_module: nn @@ -5480,22 +6817,28 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn dispatch: - CPU: elu_backward_out - CUDA: elu_backward_out + CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!) @@ -5526,100 +6869,107 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardsigmoid - CUDA: hardsigmoid - QuantizedCPU: quantized_hardsigmoid + CPU, CUDA: hardsigmoid + QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardsigmoid_backward - CUDA: hardsigmoid_backward + CPU, CUDA: hardsigmoid_backward - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: - CPU: hardtanh_out - CUDA: hardtanh_out - QuantizedCPU: quantized_hardtanh_out + CPU, CUDA: hardtanh_out + QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardtanh - CUDA: hardtanh - QuantizedCPU: quantized_hardtanh + CPU, CUDA: hardtanh + QuantizedCPU: hardtanh_quantized_cpu - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn dispatch: - CPU: hardtanh_backward_out - CUDA: hardtanh_backward_out + CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardtanh_ - CUDA: hardtanh_ - QuantizedCPU: quantized_hardtanh_ - Vulkan: vulkan_hardtanh_ + CPU, CUDA: hardtanh_ + QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: hardswish_backward - CUDA: hardswish_backward + CPU, CUDA: hardswish_backward - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: - CPU: leaky_relu_out - CUDA: leaky_relu_out - QuantizedCPU: quantized_leaky_relu_out + CPU, CUDA: leaky_relu_out + QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full python_module: nn dispatch: - CPU: leaky_relu - CUDA: leaky_relu - QuantizedCPU: quantized_leaky_relu + CPU, CUDA: leaky_relu + QuantizedCPU: heaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) 
self, Scalar negative_slope=0.01) -> Tensor(a!) + use_c10_dispatcher: full python_module: nn dispatch: - CPU: leaky_relu_ - CUDA: leaky_relu_ - QuantizedCPU: quantized_leaky_relu_ + CPU, CUDA: leaky_relu_ + QuantizedCPU: leaky_relu_quantized_cpu_ - func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5678,43 +7028,52 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: softplus_backward_out - CUDA: softplus_backward_out + CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: softshrink_backward_out - CUDA: softshrink_backward_out + CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn dispatch: - CPU: adaptive_avg_pool2d_out_cpu - CUDA: adaptive_avg_pool2d_out_cuda + CPU, CUDA: adaptive_avg_pool2d_out_cpu MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor @@ -5725,14 +7084,13 @@ use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - requires_tensor: True - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda - QuantizedCPU: quantized_adaptive_avg_pool2d + QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5746,6 +7104,7 @@ dispatch: CPU: adaptive_avg_pool3d_out_cpu CUDA: adaptive_avg_pool3d_out_cuda + QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor use_c10_dispatcher: full @@ -5753,6 +7112,7 @@ dispatch: CPU: adaptive_avg_pool3d_cpu CUDA: adaptive_avg_pool3d_cuda + QuantizedCPU: adaptive_avg_pool3d_quantized_cpu - func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -5837,7 +7197,7 @@ CPU: avg_pool2d_cpu CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d - QuantizedCPU: quantized_avg_pool2d + QuantizedCPU: avg_pool2d_quantized_cpu - func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -5857,6 +7217,7 @@ dispatch: CPU: avg_pool3d_out_cpu CUDA: avg_pool3d_out_cuda + MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor use_c10_dispatcher: full @@ -5864,7 +7225,8 @@ dispatch: CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda - QuantizedCPU: quantized_avg_pool3d + MkldnnCPU: mkldnn_avg_pool3d + QuantizedCPU: avg_pool3d_quantized_cpu - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6174,6 +7536,108 @@ CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda +- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_linear1d_cpu + CUDA: upsample_linear1d_cuda + +- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_linear1d_backward_cpu + CUDA: upsample_linear1d_backward_cuda + +- func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bilinear2d_cpu + CUDA: upsample_bilinear2d_cuda + QuantizedCPU: upsample_bilinear2d_quantized_cpu + +- func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bilinear2d_backward_cpu + CUDA: upsample_bilinear2d_backward_cuda + +- func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_trilinear3d_cpu + CUDA: upsample_trilinear3d_cuda + +- func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_trilinear3d_backward_cpu + CUDA: upsample_trilinear3d_backward_cuda + +- func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bicubic2d_cpu + CUDA: upsample_bicubic2d_cuda + +- func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_bicubic2d_backward_cpu + CUDA: upsample_bicubic2d_backward_cuda + +- func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest1d_cpu + CUDA: upsample_nearest1d_cuda + +- func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest1d_backward_cpu + CUDA: upsample_nearest1d_backward_cuda + +- func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest2d_cpu + CUDA: upsample_nearest2d_cuda + QuantizedCPU: upsample_nearest2d_quantized_cpu + +- func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest2d_backward_cpu + CUDA: upsample_nearest2d_backward_cuda + +- func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest3d_cpu + CUDA: upsample_nearest3d_cuda + QuantizedCPU: upsample_nearest3d_quantized_cpu + +- func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: upsample_nearest3d_backward_cpu + CUDA: upsample_nearest3d_backward_cuda + +# NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility. - func: upsample_linear1d.out(Tensor self, int[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: @@ -6212,7 +7676,7 @@ dispatch: CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda - QuantizedCPU: quantized_upsample_bilinear2d_cpu + QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6317,8 +7781,7 @@ dispatch: CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda - QuantizedCPU: quantized_upsample_nearest2d_cpu - Vulkan: upsample_nearest2d_vulkan + QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn @@ -6345,7 +7808,7 @@ dispatch: CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda - QuantizedCPU: quantized_upsample_nearest3d_cpu + QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6363,22 +7826,35 @@ - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: sigmoid_backward_out - CUDA: sigmoid_backward_out + CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward + +- func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU, CUDA: logit_backward_out + +- func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU: tanh_backward_out - CUDA: tanh_backward_out + CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? # @@ -6405,6 +7881,7 @@ CUDA: slow_conv_transpose2d_out_cuda - func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_cpu @@ -6430,6 +7907,7 @@ CUDA: slow_conv_transpose3d_out_cuda - func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu @@ -6452,6 +7930,7 @@ python_module: nn - func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor + use_c10_dispatcher: full python_module: nn - func: thnn_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!)) @@ -6461,6 +7940,7 @@ CUDA: legacy::cuda::_thnn_conv2d_forward_out - func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_forward_cpu @@ -6483,6 +7963,7 @@ python_module: nn - func: thnn_conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn - func: thnn_conv_depthwise2d_forward.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) @@ -6491,6 +7972,7 @@ CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward_out - func: thnn_conv_depthwise2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CUDA: legacy::cuda::_thnn_conv_depthwise2d_forward @@ -6510,6 +7992,7 @@ python_module: nn - func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor + use_c10_dispatcher: full python_module: nn - func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output, Tensor(b!) finput, Tensor(c!) fgrad_input) -> (Tensor(a!), Tensor(b!), Tensor(c!)) @@ -6518,6 +8001,7 @@ CPU: slow_conv3d_forward_out_cpu - func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input) + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv3d_forward_cpu @@ -6534,6 +8018,7 @@ CPU: slow_conv3d_backward_cpu - func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu @@ -6547,6 +8032,7 @@ CUDA: slow_conv_dilated2d_backward_cuda - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor + use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu @@ -6621,7 +8107,170 @@ variants: function, method device_guard: False -# Note: this function is only for testing. +- func: isposinf(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: isposinf_out + +- func: isneginf(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: isneginf_out + +# NOTE [_add_batch_dim and _remove_batch_dim] +# _add_batch_dim and _remove_batch_dim are meant to be used in the implementation +# of the vmap frontend API (see torch/_vmap_internals.py). They are not +# user-facing, hence the leading underscore. Please don't use them them anywhere else. +- func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor + use_c10_dispatcher: full + variants: function + +# See NOTE [_add_batch_dim and _remove_batch_dim] +- func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor + use_c10_dispatcher: full + variants: function + +## Functions related to the fast Fourier transform and the torch.fft namespace +# Note [FFT namespace binding] +# Functions in the fft python module should have their names start with +# "fft_" underscore and be bound to the desired Python name in +# torch/fft/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/fft.h. +# The "fft_" names should be hidden from the user and not documented. +# +# See fft_fft as an example. + +# torch.fft.fft +# NOTE: NOT an alias for torch.fft, which has different semantics +- func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? 
norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor + use_c10_dispatcher: full + variants: function, method + +## Functions for linear algebra and the torch.linalg namespace +# Note [linalg namespace binding] +# Functions in the linalg python module should have their names start with +# "linalg_" and be bound to the desired Python name in +# torch/linalg/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/linalg.h. +# The "linalg_" names should be hidden from the user and not documented. +# +# See linalg_det as an example. 
+ +# torch.linalg.det, alias for torch.det +- func: linalg_det(Tensor self) -> Tensor + python_module: linalg + use_c10_dispatcher: full + variants: function + +- func: det(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +# torch.outer, alias for torch.ger +- func: outer(Tensor self, Tensor vec2) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: ger(Tensor self, Tensor vec2) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) + +- func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + python_module: linalg + variants: function + +- func: linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +- func: linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + +## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor use_c10_dispatcher: full + +# Note: this function is only for testing. +- func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_intlist + +# Note: this function is only for testing. +- func: _test_optional_filled_intlist(Tensor values, int[2]? 
addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_intlist + +# Note: this function is only for testing. +- func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor + use_c10_dispatcher: full + python_module: nn + dispatch: + CPU: _test_optional_floatlist diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 08deed30..46183dd3 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -372,7 +372,7 @@ void Init_ext() .define_method( "grad=", *[](Tensor& self, torch::Tensor& grad) { - self.grad() = grad; + self.mutable_grad() = grad; }) .define_method( "_dtype", @@ -609,7 +609,7 @@ void Init_ext() .define_method( "grad=", *[](Parameter& self, torch::Tensor& grad) { - self.grad() = grad; + self.mutable_grad() = grad; }); Class rb_cDevice = define_class_under(rb_mTorch, "Device") diff --git a/ext/torch/ruby_arg_parser.h b/ext/torch/ruby_arg_parser.h index b86825ff..aa9f2df4 100644 --- a/ext/torch/ruby_arg_parser.h +++ b/ext/torch/ruby_arg_parser.h @@ -91,7 +91,7 @@ struct RubyArgs { inline c10::optional toInt64Optional(int i); inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); - // inline c10::OptionalArray doublelistOptional(int i); + inline c10::OptionalArray doublelistOptional(int i); // inline at::Layout layout(int i); // inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -105,7 +105,7 @@ struct RubyArgs { inline c10::optional memoryformatOptional(int i); // inline at::QScheme toQScheme(int i); inline std::string string(int i); - // inline c10::optional stringOptional(int i); + inline c10::optional stringOptional(int i); // inline PyObject* pyobject(int i); inline int64_t toInt64(int i); // inline int64_t toInt64WithDefault(int i, int64_t default_int); @@ -249,6 +249,25 @@ inline c10::optional RubyArgs::toDoubleOptional(int i) { return toDouble(i); } +inline c10::OptionalArray 
RubyArgs::doublelistOptional(int i) { + if (NIL_P(args[i])) return {}; + + VALUE arg = args[i]; + auto size = RARRAY_LEN(arg); + std::vector res(size); + for (idx = 0; idx < size; idx++) { + VALUE obj = rb_ary_entry(arg, idx); + if (FIXNUM_P(obj) || RB_FLOAT_TYPE_P(obj)) { + res[idx] = from_ruby(obj); + } else { + rb_raise(rb_eArgError, "%s(): argument '%s' must be %s, but found element of type %s at pos %d", + signature.name.c_str(), signature.params[i].name.c_str(), + signature.params[i].type_name().c_str(), rb_obj_classname(obj), idx + 1); + } + } + return res; +} + inline c10::optional RubyArgs::layoutOptional(int i) { if (NIL_P(args[i])) return c10::nullopt; @@ -285,6 +304,11 @@ inline std::string RubyArgs::string(int i) { return from_ruby(args[i]); } +inline c10::optional RubyArgs::stringOptional(int i) { + if (!args[i]) return c10::nullopt; + return from_ruby(args[i]); +} + inline int64_t RubyArgs::toInt64(int i) { if (NIL_P(args[i])) return signature.params[i].default_int; return from_ruby(args[i]); diff --git a/ext/torch/templates.h b/ext/torch/templates.h index 40a68c6b..176302c9 100644 --- a/ext/torch/templates.h +++ b/ext/torch/templates.h @@ -19,6 +19,7 @@ using torch::TensorOptions; using torch::Layout; using torch::MemoryFormat; using torch::IntArrayRef; +using torch::ArrayRef; using torch::TensorList; using torch::Storage; diff --git a/ext/torch/wrap_outputs.h b/ext/torch/wrap_outputs.h index 914b2688..97ab209e 100644 --- a/ext/torch/wrap_outputs.h +++ b/ext/torch/wrap_outputs.h @@ -90,3 +90,10 @@ inline Object wrap(torch::TensorList x) { } return Object(a); } + +inline Object wrap(std::tuple x) { + Array a; + a.push(to_ruby(std::get<0>(x))); + a.push(to_ruby(std::get<1>(x))); + return Object(a); +} From 30d02fe12c5833340e862e446b5ae74218984e27 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 16:25:29 -0700 Subject: [PATCH 02/28] Added test for max change in 1.7.0 [skip ci] --- test/autograd_test.rb | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/test/autograd_test.rb b/test/autograd_test.rb index 79d64e39..1ad40314 100644 --- a/test/autograd_test.rb +++ b/test/autograd_test.rb @@ -80,4 +80,12 @@ def test_variable_invalid end assert_equal "Variable data has to be a tensor, but got Object", error.message end + + # 1.7.0 behavior + def test_max + a = Torch.tensor([3.0, 2, 3], requires_grad: true) + a.max.backward + # TODO debug + # assert_equal [0.5, 0, 0.5], a.grad.to_a + end end From d348c9302fbfe2641cdd9828c27c200be24bd902 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:31:04 -0700 Subject: [PATCH 03/28] Updated changelog [skip ci] --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5524cdcb..8e1b7273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.5.0 (unreleased) + +- Updated LibTorch to 1.7.0 + ## 0.4.2 (2020-10-27) - Fixed errors with optimizer options From 47841b1bf92fd24e25def1a2bf845f9c9e451cae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:43:12 -0700 Subject: [PATCH 04/28] Removed deprecated overload for addcmul! and addcdiv! 
--- CHANGELOG.md | 1 + ext/torch/ext.cpp | 10 ---------- lib/torch/optim/adadelta.rb | 4 ++-- lib/torch/optim/adagrad.rb | 4 ++-- lib/torch/optim/adam.rb | 4 ++-- lib/torch/optim/adamax.rb | 2 +- lib/torch/optim/adamw.rb | 4 ++-- lib/torch/optim/rmsprop.rb | 6 +++--- lib/torch/optim/rprop.rb | 2 +- 9 files changed, 14 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e1b7273..9d59630e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.5.0 (unreleased) - Updated LibTorch to 1.7.0 +- Removed deprecated overload for `addcmul!` and `addcdiv!` ## 0.4.2 (2020-10-27) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 46183dd3..78097260 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -348,16 +348,6 @@ void Init_ext() *[](Tensor& self) { return self.is_contiguous(); }) - .define_method( - "addcmul!", - *[](Tensor& self, Scalar value, const Tensor & tensor1, const Tensor & tensor2) { - return self.addcmul_(tensor1, tensor2, value); - }) - .define_method( - "addcdiv!", - *[](Tensor& self, Scalar value, const Tensor & tensor1, const Tensor & tensor2) { - return self.addcdiv_(tensor1, tensor2, value); - }) .define_method( "_requires_grad!", *[](Tensor& self, bool requires_grad) { diff --git a/lib/torch/optim/adadelta.rb b/lib/torch/optim/adadelta.rb index 3b496d8a..af06786d 100644 --- a/lib/torch/optim/adadelta.rb +++ b/lib/torch/optim/adadelta.rb @@ -42,11 +42,11 @@ def step(closure = nil) grad = grad.add(p.data, alpha: group[:weight_decay]) end - square_avg.mul!(rho).addcmul!(1 - rho, grad, grad) + square_avg.mul!(rho).addcmul!(grad, grad, value: 1 - rho) std = square_avg.add(eps).sqrt! 
delta = acc_delta.add(eps).sqrt!.div!(std).mul!(grad) p.data.add!(delta, alpha: -group[:lr]) - acc_delta.mul!(rho).addcmul!(1 - rho, delta, delta) + acc_delta.mul!(rho).addcmul!(delta, delta, value: 1 - rho) end end diff --git a/lib/torch/optim/adagrad.rb b/lib/torch/optim/adagrad.rb index d8322c5d..1e17e10f 100644 --- a/lib/torch/optim/adagrad.rb +++ b/lib/torch/optim/adagrad.rb @@ -57,9 +57,9 @@ def step(closure = nil) if grad.sparse? raise NotImplementedYet else - state[:sum].addcmul!(1, grad, grad) + state[:sum].addcmul!(grad, grad, value: 1) std = state[:sum].sqrt.add!(group[:eps]) - p.data.addcdiv!(-clr, grad, std) + p.data.addcdiv!(grad, std, value: -clr) end end end diff --git a/lib/torch/optim/adam.rb b/lib/torch/optim/adam.rb index 3a110ba9..1ce128a3 100644 --- a/lib/torch/optim/adam.rb +++ b/lib/torch/optim/adam.rb @@ -58,7 +58,7 @@ def step(closure = nil) # Decay the first and second moment running average coefficient exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1) - exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad) + exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2) if amsgrad # Maintains the maximum of all 2nd moment running avg. 
till now Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq) @@ -70,7 +70,7 @@ def step(closure = nil) step_size = group[:lr] / bias_correction1 - p.data.addcdiv!(-step_size, exp_avg, denom) + p.data.addcdiv!(exp_avg, denom, value: -step_size) end end diff --git a/lib/torch/optim/adamax.rb b/lib/torch/optim/adamax.rb index 64fe5954..aeb878cf 100644 --- a/lib/torch/optim/adamax.rb +++ b/lib/torch/optim/adamax.rb @@ -57,7 +57,7 @@ def step(closure = nil) bias_correction = 1 - beta1 ** state[:step] clr = group[:lr] / bias_correction - p.data.addcdiv!(-clr, exp_avg, exp_inf) + p.data.addcdiv!(exp_avg, exp_inf, value: -clr) end end diff --git a/lib/torch/optim/adamw.rb b/lib/torch/optim/adamw.rb index b31e8f85..db9608c0 100644 --- a/lib/torch/optim/adamw.rb +++ b/lib/torch/optim/adamw.rb @@ -59,7 +59,7 @@ def step(closure = nil) # Decay the first and second moment running average coefficient exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1) - exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad) + exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2) if amsgrad # Maintains the maximum of all 2nd moment running avg. 
till now Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq) @@ -71,7 +71,7 @@ def step(closure = nil) step_size = group[:lr] / bias_correction1 - p.data.addcdiv!(-step_size, exp_avg, denom) + p.data.addcdiv!(exp_avg, denom, value: -step_size) end end diff --git a/lib/torch/optim/rmsprop.rb b/lib/torch/optim/rmsprop.rb index 367f07b5..c05d7959 100644 --- a/lib/torch/optim/rmsprop.rb +++ b/lib/torch/optim/rmsprop.rb @@ -49,7 +49,7 @@ def step(closure = nil) grad = grad.add(p.data, alpha: group[:weight_decay]) end - square_avg.mul!(alpha).addcmul!(1 - alpha, grad, grad) + square_avg.mul!(alpha).addcmul!(grad, grad, value: 1 - alpha) if group[:centered] grad_avg = state[:grad_avg] @@ -61,10 +61,10 @@ def step(closure = nil) if group[:momentum] > 0 buf = state[:momentum_buffer] - buf.mul!(group[:momentum]).addcdiv!(1, grad, avg) + buf.mul!(group[:momentum]).addcdiv!(grad, avg, value: 1) p.data.add!(buf, alpha: -group[:lr]) else - p.data.addcdiv!(-group[:lr], grad, avg) + p.data.addcdiv!(grad, avg, value: -group[:lr]) end end end diff --git a/lib/torch/optim/rprop.rb b/lib/torch/optim/rprop.rb index 226b4759..cf50d03c 100644 --- a/lib/torch/optim/rprop.rb +++ b/lib/torch/optim/rprop.rb @@ -52,7 +52,7 @@ def step(closure = nil) grad[sign.eq(etaminus)] = 0 # update parameters - p.data.addcmul!(-1, grad.sign, step_size) + p.data.addcmul!(grad.sign, step_size, value: -1) state[:prev].copy!(grad) end From 14b38d77860ca02dea9b6b3d8667aef97e7c208e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 27 Oct 2020 17:53:16 -0700 Subject: [PATCH 05/28] Updated readme [skip ci] --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5b5a6e43..f389f47f 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,7 @@ Here’s the list of compatible versions. 
Torch.rb | LibTorch --- | --- +0.5.0+ | 1.7.0 0.3.0+ | 1.6.0 0.2.0-0.2.7 | 1.5.0-1.5.1 0.1.8 | 1.4.0 From 80a601d32ab90e5f1294fbddc832b46d8bff101c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 09:54:13 +0000 Subject: [PATCH 06/28] manual_seed and manual_seed_all CUDA methods added --- ext/torch/ext.cpp | 6 +++++- test/cuda_test.rb | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index 78097260..cd10498a 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -16,6 +16,8 @@ #include "tensor_functions.h" #include "nn_functions.h" +//#include "cuda_functions.h" + using namespace Rice; using torch::indexing::TensorIndex; @@ -618,5 +620,7 @@ void Init_ext() Module rb_mCUDA = define_module_under(rb_mTorch, "CUDA") .add_handler(handle_error) .define_singleton_method("available?", &torch::cuda::is_available) - .define_singleton_method("device_count", &torch::cuda::device_count); + .define_singleton_method("device_count", &torch::cuda::device_count) + .define_singleton_method("manual_seed", &torch::cuda::manual_seed) + .define_singleton_method("manual_seed_all", &torch::cuda::manual_seed_all); } diff --git a/test/cuda_test.rb b/test/cuda_test.rb index 2b33f615..ed54e20e 100644 --- a/test/cuda_test.rb +++ b/test/cuda_test.rb @@ -26,4 +26,30 @@ def test_tensor assert_equal "PyTorch is not linked with support for cuda devices", error.message end end + + def test_random_seed + if Torch::CUDA.available? 
+ Torch::CUDA.manual_seed_all 42 + + comparables = Torch::CUDA.device_count.times.map do |i| + x, y = 2.times.map { Torch.rand(100, device: "cuda:#{i}").to_a } + assert x != y + [x, y] + end + + Torch::CUDA.manual_seed_all 42 + Torch::CUDA.device_count.times.map do |i| + x, y = 2.times.map { Torch.rand(100, device: "cuda:#{i}").to_a } + assert x != y + assert_equal x, comparables[i].first + assert_equal y, comparables[i].last + end + else + error = assert_raises do + Torch.random 1, device: 'cuda:0' + end + + assert_equal "PyTorch is not linked with support for cuda devices", error.message + end + end end From 0612de7bd87cc6c9ec34746164553a1c0fcb83f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 10:04:29 +0000 Subject: [PATCH 07/28] Unknown parameter in module error message fixed --- lib/torch/nn/module.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 9418bfcb..91c698e9 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -134,7 +134,7 @@ def load_state_dict(state_dict) param.copy!(input_param) end else - raise Error, "Unknown parameter: #{k1}" + raise Error, "Unknown parameter `#{k2}` in module `#{k1}`" end else raise Error, "Unknown module: #{k1}" From 5e117f0cf02698f6b33c9f10805c8b38d4900980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 10:09:49 +0000 Subject: [PATCH 08/28] fixed CUDA random test for non-CUDA environment --- test/cuda_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda_test.rb b/test/cuda_test.rb index ed54e20e..8d3897f2 100644 --- a/test/cuda_test.rb +++ b/test/cuda_test.rb @@ -46,7 +46,7 @@ def test_random_seed end else error = assert_raises do - Torch.random 1, device: 'cuda:0' + Torch.rand 1, device: 'cuda:0' 
end assert_equal "PyTorch is not linked with support for cuda devices", error.message From 987a3003d40a52df434e6d8f6502b0ecc06a6223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Thu, 17 Dec 2020 19:39:03 +0000 Subject: [PATCH 09/28] removed useless commented out header --- ext/torch/ext.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index cd10498a..f0a8da2c 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -16,8 +16,6 @@ #include "tensor_functions.h" #include "nn_functions.h" -//#include "cuda_functions.h" - using namespace Rice; using torch::indexing::TensorIndex; From cecf39813c54fb4e51e9e49ec0df9c9bf219fb22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 21 Dec 2020 15:57:59 +0000 Subject: [PATCH 10/28] named buffers load/save --- lib/torch/nn/module.rb | 49 +++++++++++++++++++++++++++++------------- test/nn/module_test.rb | 13 +++++++++++ test/support/net.rb | 23 ++++++++++++++++++++ 3 files changed, 70 insertions(+), 15 deletions(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 91c698e9..9b6a459b 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -113,31 +113,41 @@ def call(*input, **kwargs) forward(*input, **kwargs) end - def state_dict(destination: nil) + def state_dict(destination: nil, prefix: "") destination ||= {} - named_parameters.each do |k, v| - destination[k] = v + save_to_state_dict(destination, prefix: prefix) + + named_children.each do |name, mod| + next unless mod + + mod.state_dict(destination: destination, prefix: prefix + name + '.') end destination end - + # TODO add strict option # TODO match PyTorch behavior def load_state_dict(state_dict) state_dict.each do |k, input_param| - k1, k2 = k.split(".", 2) - mod = named_modules[k1] - if mod.is_a?(Module) - param = 
mod.named_parameters[k2] - if param.is_a?(Parameter) - Torch.no_grad do - param.copy!(input_param) - end - else - raise Error, "Unknown parameter `#{k2}` in module `#{k1}`" + *mods, param_name = k.split(".") + + mods_ok = [] + mod = mods.inject(self) do |mod, name| + child = mod.named_modules[name] + raise Error, "Unknown module `#{[mods_ok + name].join '.'}`" unless child.is_a?(Module) + + mods_ok << name + child + end + + param = mod.named_parameters[param_name] || mod.named_buffers[param_name] + if param.is_a?(Parameter) || param.is_a?(Tensor) + Torch.no_grad do + param.copy!(input_param) end else - raise Error, "Unknown module: #{k1}" + p mod, param.class, mod.named_parameters.keys, mod.named_buffers.keys + raise Error, "Unknown parameter `#{param_name}` in module `#{mods.join '.'}`" end end @@ -300,6 +310,15 @@ def format(str, *vars, **options) def dict instance_variables.reject { |k| instance_variable_get(k).is_a?(Tensor) }.map { |k| [k[1..-1].to_sym, instance_variable_get(k)] }.to_h end + + def save_to_state_dict(destination, prefix: "") + named_parameters(prefix: prefix, recurse: false).each do |k, v| + destination[k] = v + end + named_buffers.each do |k, v| + destination[prefix + k] = v + end + end end end end diff --git a/test/nn/module_test.rb b/test/nn/module_test.rb index 0588a3c4..a7cce31c 100644 --- a/test/nn/module_test.rb +++ b/test/nn/module_test.rb @@ -70,6 +70,19 @@ def test_state_dict # Torch.save(optimizer.state_dict, tmpfile2.path) end + def test_state_dict_with_buffers + net = SimpleResidualBlock.new + expected_keys = %w[seq.0.weight seq.1.weight seq.1.bias seq.1.running_mean seq.1.running_var seq.1.num_batches_tracked seq.3.weight seq.4.weight seq.4.bias seq.4.running_mean seq.4.running_var seq.4.num_batches_tracked seq.6.weight seq.7.weight seq.7.bias seq.7.running_mean seq.7.running_var seq.7.num_batches_tracked] + assert_equal expected_keys, net.state_dict.keys + + tmpfile = Tempfile.new + Torch.save net.state_dict, tmpfile.path + + net 
= SimpleResidualBlock.new + net.load_state_dict Torch.load tmpfile.path + net.eval + end + def test_inspect assert_match "(conv1): Conv2d(1, 6, kernel_size: [3, 3], stride: [1, 1])", net.inspect end diff --git a/test/support/net.rb b/test/support/net.rb index de1fe602..11a2e9a3 100644 --- a/test/support/net.rb +++ b/test/support/net.rb @@ -27,3 +27,26 @@ def num_flat_features(x) num_features end end + +class SimpleResidualBlock < Torch::NN::Module + def initialize + super + + @relu = Torch::NN::ReLU.new + + @seq = Torch::NN::Sequential.new( + Torch::NN::Conv2d.new(64, 128, 3, padding: 1, bias: false), + Torch::NN::BatchNorm2d.new(128), + @relu, + Torch::NN::Conv2d.new(128, 128, 3, padding: 1, bias: false), + Torch::NN::BatchNorm2d.new(128), + @relu, + Torch::NN::Conv2d.new(128, 64, 3, bias: false), + Torch::NN::BatchNorm2d.new(64) + ) + end + + def forward(x) + @relu.forward(@seq.forward(x) + x) + end +end From b20770cd44b36b78009e52783f2501fede94a090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 21 Dec 2020 17:55:04 +0000 Subject: [PATCH 11/28] debug print removed --- lib/torch/nn/module.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 9b6a459b..a6e4e292 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -146,7 +146,6 @@ def load_state_dict(state_dict) param.copy!(input_param) end else - p mod, param.class, mod.named_parameters.keys, mod.named_buffers.keys raise Error, "Unknown parameter `#{param_name}` in module `#{mods.join '.'}`" end end From ef05f1099ea01d9884a09ef9d934ae0ec8d6345e Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 08:21:51 +0000 Subject: [PATCH 12/28] multihead attention --- lib/torch.rb | 2 + lib/torch/nn/functional_attention.rb | 242 +++++++++++++++++++++++++++ lib/torch/nn/multihead_attention.rb | 123 ++++++++++++++ test/nn/functional_attention_test.rb | 
141 ++++++++++++++++ 4 files changed, 508 insertions(+) create mode 100644 lib/torch/nn/functional_attention.rb create mode 100644 lib/torch/nn/multihead_attention.rb create mode 100644 test/nn/functional_attention_test.rb diff --git a/lib/torch.rb b/lib/torch.rb index 20996052..620e016a 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -132,6 +132,7 @@ require "torch/nn/softsign" require "torch/nn/tanh" require "torch/nn/tanhshrink" +require "torch/nn/multihead_attention" # nn activations other require "torch/nn/log_softmax" @@ -174,6 +175,7 @@ # nn other require "torch/nn/functional" +require "torch/nn/functional_attention" require "torch/nn/init" # utils diff --git a/lib/torch/nn/functional_attention.rb b/lib/torch/nn/functional_attention.rb new file mode 100644 index 00000000..cfc6d3ac --- /dev/null +++ b/lib/torch/nn/functional_attention.rb @@ -0,0 +1,242 @@ +module Torch + module NN + class Functional + class << self + def in_projection_packed(q, k, v, w, b: nil) + e = q.size(-1) + + if k.eql? v + if q.eql? k + # self-attention + return linear(q, w, b).chunk(3, dim: -1) + else + # encoder-decoder attention + w_q, w_kv = w.split_with_sizes([e, e * 2]) + if b.nil? + b_q = b_kv = nil + else + b_q, b_kv = b.split_with_sizes([e, e * 2]) + end + + return [linear(q, w_q, b_q), *linear(k, w_kv, b_kv).chunk(2, dim: -1)] + end + else + w_q, w_k, w_v = w.chunk(3) + if b.nil? 
+              b_q = b_k = b_v = nil
+            else
+              b_q, b_k, b_v = b.chunk(3)
+            end
+
+            return [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+          end
+        end
+
+        def in_projection(
+          q, k, v,
+          w_q, w_k, w_v,
+          b_q: nil, b_k: nil, b_v: nil
+        )
+
+          e_q, e_k, e_v = q.size(-1), k.size(-1), v.size(-1)
+
+          raise ArgumentError, "Expecting query weights shape of #{[e_q, e_q]}, but got #{w_q.shape}" unless w_q.shape == [e_q, e_q]
+          raise ArgumentError, "Expecting key weights shape of #{[e_k, e_k]}, but got #{w_k.shape}" unless w_k.shape == [e_k, e_k]
+          raise ArgumentError, "Expecting value weights shape of #{[e_v, e_v]}, but got #{w_v.shape}" unless w_v.shape == [e_v, e_v]
+
+          raise ArgumentError, "Expecting query bias shape of #{[e_q]}, but got #{b_q.shape}" if b_q && b_q.shape != [e_q]
+          raise ArgumentError, "Expecting key bias shape of #{[e_k]}, but got #{b_k.shape}" if b_k && b_k.shape != [e_k]
+          raise ArgumentError, "Expecting value bias shape of #{[e_v]}, but got #{b_v.shape}" if b_v && b_v.shape != [e_v]
+
+          [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+        end
+
+        def scaled_dot_product_attention(
+          q, k, v,
+          attn_mask: nil, dropout_p: 0.0
+        )
+
+          b, nt, e = q.shape
+
+          q = q / Math.sqrt(e)
+
+          attn = Torch.bmm(q, k.transpose(-2, -1))
+          attn += attn_mask if attn_mask
+          attn = softmax(attn, dim: -1)
+          attn = dropout(attn, p: dropout_p) if dropout_p > 0
+
+          output = Torch.bmm(attn, v)
+
+          [output, attn]
+        end
+
+        def multi_head_attention_forward(
+          query, key, value,
+          embed_dim_to_check, num_heads,
+          in_proj_weight, in_proj_bias,
+          bias_k, bias_v,
+          add_zero_attn,
+          dropout_p,
+          out_proj_weight, out_proj_bias,
+          training: true,
+          key_padding_mask: nil,
+          need_weights: true,
+          attn_mask: nil,
+          use_separate_proj_weight: false,
+          q_proj_weight: nil, k_proj_weight: nil, v_proj_weight: nil,
+          static_k: nil, static_v: nil
+        )
+
+          tgt_len, bsz, embed_dim = query.shape
+          src_len = key.shape.first
+
+          raise ArgumentError, "Was expecting embedding dimension of
#{embed_dim_to_check}, but got #{embed_dim}" unless embed_dim == embed_dim_to_check + + head_dim = if embed_dim.is_a?(Torch::Tensor) + embed_dim.div(num_heads, rounding_mode: 'trunc') + else + head_dim = embed_dim.div num_heads + end + + if use_separate_proj_weight + raise ArgumentError, "Key's sequence and batch dims #{key.shape[...2]} do not match value's #{value.shape[...2]}" unless key.shape[...2] == value.shape[...2] + else + raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape + end + + + # compute in-projection + q, k, v = + if use_separate_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but q_proj_weight is nil" unless q_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but k_proj_weight is nil" unless k_proj_weight + raise ArgumentError, "use_separate_proj_weight is true but v_proj_weight is nil" unless v_proj_weight + + if in_proj_bias + b_q, b_k, b_v = in_proj_bias.chunk(3) + else + b_q = b_k = b_v = nil + end + + in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q: b_q, b_k: b_k, b_v: b_v) + else + in_projection_packed(query, key, value, in_proj_weight, b: in_proj_bias) + end + + # prep attention mask + if attn_mask + if attn_mask.dtype == :uint8 + puts "[WARN] Byte tensor for attn_mask in Multihead Attention is deprecated. Use bool tensor instead." + attn_mask = attn_mask.bool + else + raise ArgumentError, "Only float, byte, and bool types are supported for attn_mask, not #{attn_mask.dtype}" unless attn_mask.floating_point? or attn_mask.dtype == :bool + end + + if attn_mask.dim == 2 + correct_2d_size = [tgt_len, src_len] + raise ArgumentError, "The shape of the 2D attn_mask is #{attn_mask.shape}, but should be #{correct_2d_size}." 
unless attn_mask.shape == correct_2d_size + + attn_mask = attn_mask.unsqueeze(0) + elsif attn_mask.dim == 3 + correct_3d_size = [bsz * num_heads, tgt_len, src_len] + raise ArgumentError, "The shape of the 3D attn_mask is #{attn_mask.shape}, but should be #{correct_3d_size}." unless attn_mask.shape == correct_3d_size + else + raise ArgumentError, "attn_mask's dimension #{attn_mask.dim} is not supported" + end + end + + # prep key padding mask + if key_padding_mask && key_padding_mask.dtype == :uint8 + puts "[WARN] Byte tensor for key_padding_mask in Multihead Attention is deprecated. Use bool tensor instead." + key_padding_mask = key_padding_mask.bool + end + + # add bias along batch dimension (currently second) + if bias_k && bias_v + raise ArgumentError, "bias cannot be added to static key." if static_k + raise ArgumentError, "bias cannot be added to static value." if static_v + + k = Torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = Torch.cat([v, bias_v.repeat(1, bsz, 1)]) + + attn_mask = pad(attn_mask, [0, 1]) if attn_mask + key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask + else + raise ArgumentError unless bias_k.nil? + raise ArgumentError unless bias_v.nil? + end + + # reshape q, k, v for multihead attention and make em batch first + q = q.contiguous.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + + if static_k.nil? + k = k.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1) + else + raise ArgumentError, "Expecting static_k.size(0) of #{bsz * num_heads}, but got #{static_k.size(0)}" unless static_k.size(0) == bsz * num_heads + raise ArgumentError, "Expecting static_k.size(2) of #{bsz * num_heads}, but got #{static_k.size(2)}" unless static_k.size(2) == bsz * num_heads + + k = static_k + end + + if static_v.nil? 
+ v = v.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1) + else + raise ArgumentError, "Expecting static_v.size(0) of #{bsz * num_heads}, but got #{static_v.size(0)}" unless static_v.size(0) == bsz * num_heads + raise ArgumentError, "Expecting static_v.size(2) of #{bsz * num_heads}, but got #{static_v.size(2)}" unless static_v.size(2) == bsz * num_heads + + v = static_v + end + + # add zero attention along batch dimension (now first) + if add_zero_attn + zero_attn_shape = [bsz * num_heads, 1, head_dim] + k = Torch.cat([k, Torch.zeros(zero_attn_shape, dtype: k.dtype, device: k.device)], dim: 1) + v = Torch.cat([v, Torch.zeros(zero_attn_shape, dtype: v.dtype, device: v.device)], dim: 1) + + attn_mask = pad(attn_mask, [0, 1]) if attn_mask + key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask + end + + # update source sequence length after adjustments + src_len = k.size(1) + + # merge key padding and attention masks + if key_padding_mask + raise ArgumentError, "Expecting key_padding_mask shape of #{[bsz, src_len]}, but got #{key_padding_mask.shape}" unless key_padding_mask.shape == [bsz, src_len] + + key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + + attn_mask = if attn_mask.nil? 
+ key_padding_mask + elsif attn_mask.dtype == :bool + attn_mask.logical_or(key_padding_mask) + else + attn_mask.masked_fill(key_padding_mask, -Float::INFINITY) + end + end + + # convert mask to float + if attn_mask && attn_mask.dtype == :bool + new_attn_mask = Torch.zeros_like(attn_mask, dtype: :float32) + attn_mask = new_attn_mask.masked_fill(attn_mask, -Float::INFINITY) + end + + dropout_p = 0.0 unless training + + # (deep breath) calculate attention and out projection + attn_output, attn_output_weights = scaled_dot_product_attention(q, k, v, attn_mask: attn_mask, dropout_p: dropout_p) + attn_output = attn_output.transpose(0, 1).contiguous.view(tgt_len, bsz, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + [attn_output, attn_output_weights.sum(dim: 1) / num_heads] + else + [attn_output, nil] + end + end + end + end + end +end diff --git a/lib/torch/nn/multihead_attention.rb b/lib/torch/nn/multihead_attention.rb new file mode 100644 index 00000000..566fb3c5 --- /dev/null +++ b/lib/torch/nn/multihead_attention.rb @@ -0,0 +1,123 @@ +module Torch + module NN + class MultiheadAttention < Module + def initialize( + embed_dim, num_heads, + dropout: 0.0, bias: true, add_bias_kv: false, add_zero_attn: false, + kdim: nil, vdim: nil, batch_first: false, device: nil, dtype: nil + ) + + super() + + @embed_dim = embed_dim + @kdim = kdim || @embed_dim + @vdim = vdim || @embed_dim + + @qkv_same_embed_dim = @kdim == @embed_dim && @vdim == @embed_dim + + @num_heads = num_heads + @dropout = dropout + @batch_first = batch_first + + @head_dim = @embed_dim.div @num_heads + + raise ArgumentError, "embed_dim must be divisible by num_heads" unless @head_dim * @num_heads == @embed_dim + + if @qkv_same_embed_dim + @in_proj_weight = Parameter.new(Torch.empty([3 * @embed_dim, @embed_dim])) + %w(q k v).each { |x| 
register_parameter("#{x}_proj_weight", nil) } + else + @q_proj_weight = Parameter.new(Torch.empty([@embed_dim, @embed_dim])) + @k_proj_weight = Parameter.new(Torch.empty([@embed_dim, @kdim])) + @v_proj_weight = Parameter.new(Torch.empty([@embed_dim, @vdim])) + + register_parameter('in_proj_weight', nil) + end + + if bias + @in_proj_bias = Parameter.new(Torch.empty(3 * @embed_dim)) + else + register_parameter('in_proj_bias', nil) + end + + @out_proj = Linear.new(@embed_dim, @embed_dim, bias: bias) + + if add_bias_kv + @bias_k = Parameter.new(Torch.empty([1, 1, @embed_dim])) + @bias_v = Parameter.new(Torch.empty([1, 1, @embed_dim])) + else + @bias_k = @bias_v = nil + end + + @add_zero_attn = add_zero_attn + + reset_parameters + end + + def batch_first? + !!@batch_first + end + + def reset_parameters + if @qkv_same_embed_dim + Init.xavier_uniform!(@in_proj_weight) + else + Init.xavier_uniform!(@q_proj_weight) + Init.xavier_uniform!(@k_proj_weight) + Init.xavier_uniform!(@v_proj_weight) + end + + if @in_proj_bias + Init.constant!(@in_proj_bias, 0.0) + Init.constant!(@out_proj.bias, 0.0) + end + + Init.xavier_uniform!(@bias_k) if @bias_k + Init.xavier_uniform!(@bias_v) if @bias_v + end + + def forward( + query, key, value, + key_padding_mask: nil, need_weights: true, attn_mask: nil + ) + + if batch_first? 
+          query, key, value = [query, key, value].map { |t| t.transpose(1, 0) }
+        end
+
+        attn_output, attn_output_weights =
+          if @qkv_same_embed_dim
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask
+            )
+          else
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask,
+              use_separate_proj_weight: true,
+              q_proj_weight: @q_proj_weight, k_proj_weight: @k_proj_weight, v_proj_weight: @v_proj_weight
+            )
+          end
+
+        attn_output = attn_output.transpose(1, 0) if batch_first?
+
+        [attn_output, attn_output_weights]
+      end
+    end
+  end
+end
diff --git a/test/nn/functional_attention_test.rb b/test/nn/functional_attention_test.rb
new file mode 100644
index 00000000..de18a7cb
--- /dev/null
+++ b/test/nn/functional_attention_test.rb
@@ -0,0 +1,141 @@
+require_relative '../test_helper'
+
+class FunctionalAttentionTest < Minitest::Test
+  T = 4
+  S = 8
+  B = 2
+  E = 6
+
+  SEED = 42
+
+  def test_self_attention_no_mask
+    t = Torch.ones([T, B, E])
+    Torch.manual_seed SEED
+    attn = Torch::NN::MultiheadAttention.new E, 2
+    out, weights = attn.(t, t, t)
+
+    expected_out = Torch.tensor([
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357],
+       [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]],
+
+      [[-1.2826, 0.4973, -0.3479,
0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500]], + + [[0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500], + [0.2500, 0.2500, 0.2500, 0.2500]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - b).abs.lt(1e-6).all + end + end + + def test_self_attention_with_masks + t = Torch.ones([T, B, E]) + Torch.manual_seed SEED + attn = Torch::NN::MultiheadAttention.new E, 2 + + attn_mask = Torch.triu(Torch.ones([T, T]), diagonal: 1).eq(1) + key_padding_mask = Torch.triu(Torch.zeros(B, T)) + key_padding_mask[0, -1] = 1 + + out, weights = attn.(t, t, t, attn_mask: attn_mask, key_padding_mask: key_padding_mask) + + expected_out = Torch.tensor([ + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[1.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000]], + + [[1.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - 
b).abs.lt(1e-6).all + end + end + + def test_encoder_decoder_attention + q = Torch.ones([T, B, E]) + k = v = Torch.ones([S, B, E]) + Torch.manual_seed SEED + attn = Torch::NN::MultiheadAttention.new E, 2 + + attn_mask = Torch.triu(Torch.ones([T, S]), diagonal: 1).eq(1) + key_padding_mask = Torch.triu(Torch.zeros(B, S)) + key_padding_mask[0, -1] = 1 + + out, weights = attn.(q, k, v, attn_mask: attn_mask, key_padding_mask: key_padding_mask) + + expected_out = Torch.tensor([ + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]], + + [[-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357], + [-1.2826, 0.4973, -0.3479, 0.3659, 0.6462, 0.1357]] + ]) + + expected_weights = Torch.tensor([ + [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000]], + + [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000]] + ]) + + assert_equal out.shape, expected_out.shape + assert_equal weights.shape, expected_weights.shape + + [[out.detach, expected_out], [weights.detach, expected_weights]].each do |(a, b)| + assert (a - b).abs.lt(1e-6).all + end + end +end From d2edbab39079c58884feae8e14d5982d4aa46ea8 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:25:47 +0000 Subject: [PATCH 13/28] removed endless range for respecting dying ruby 2.6 --- lib/torch/nn/functional_attention.rb 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/torch/nn/functional_attention.rb b/lib/torch/nn/functional_attention.rb index cfc6d3ac..8e3785c6 100644 --- a/lib/torch/nn/functional_attention.rb +++ b/lib/torch/nn/functional_attention.rb @@ -99,7 +99,7 @@ def multi_head_attention_forward( end if use_separate_proj_weight - raise ArgumentError, "Key's sequence and batch dims #{key.shape[...2]} do not match value's #{value.shape[...2]}" unless key.shape[...2] == value.shape[...2] + raise ArgumentError, "Key's sequence and batch dims #{key.shape[0...2]} do not match value's #{value.shape[0...2]}" unless key.shape[0...2] == value.shape[0...2] else raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape end From f9e9e86d7290c4636fbf6e9c691ed31ac16de4f2 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:25:58 +0000 Subject: [PATCH 14/28] module list --- lib/torch/nn/module.rb | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index 145ec26e..5fa0dade 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -387,5 +387,58 @@ def save_to_state_dict(destination, prefix: "") end end end + + class ModuleList < Module + def initialize(mods = nil) + super() + + return unless mods + self.extend(mods) + end + + def length + @modules.length + end + + alias :count :length + + def extend(mods) + raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each) + + mods.each { |m| append m } + + self + end + + def each(&block) + @modules.values.each &block + end + + def map(&block) + @modules.values.map &block + end + + def inject(inj, &block) + @modules.values.inject(inj, &block) + end + + def append(mod) + raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) + add_module(length.to_s, mod) + self + end + + def [](*idx) + idx.map do 
|id| + if id.is_a?(Integer) + @modules[id.to_s] + elsif id.is_a?(Range) + id.each do |i| + @modules[i.to_s] + end + end + end.flatten + end + end end end From 898b4dd7a0df57e8be78890da07a7d479a890240 Mon Sep 17 00:00:00 2001 From: "i.razuvaev" Date: Wed, 4 Aug 2021 14:27:08 +0000 Subject: [PATCH 15/28] Transformer: attention is all you need --- lib/torch.rb | 5 +- lib/torch/nn/transformer.rb | 103 ++++++++++++++++++++ lib/torch/nn/transformer_decoder.rb | 23 +++++ lib/torch/nn/transformer_decoder_layer.rb | 56 +++++++++++ lib/torch/nn/transformer_encoder.rb | 23 +++++ lib/torch/nn/transformer_encoder_layer.rb | 48 ++++++++++ test/nn/transformer_test.rb | 110 ++++++++++++++++++++++ 7 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 lib/torch/nn/transformer.rb create mode 100644 lib/torch/nn/transformer_decoder.rb create mode 100644 lib/torch/nn/transformer_decoder_layer.rb create mode 100644 lib/torch/nn/transformer_encoder.rb create mode 100644 lib/torch/nn/transformer_encoder_layer.rb create mode 100644 test/nn/transformer_test.rb diff --git a/lib/torch.rb b/lib/torch.rb index 620e016a..019170da 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -132,7 +132,6 @@ require "torch/nn/softsign" require "torch/nn/tanh" require "torch/nn/tanhshrink" -require "torch/nn/multihead_attention" # nn activations other require "torch/nn/log_softmax" @@ -144,6 +143,10 @@ require "torch/nn/embedding" require "torch/nn/embedding_bag" +# attention is all you need +require "torch/nn/multihead_attention" +require "torch/nn/transformer" + # nn distance functions require "torch/nn/cosine_similarity" require "torch/nn/pairwise_distance" diff --git a/lib/torch/nn/transformer.rb b/lib/torch/nn/transformer.rb new file mode 100644 index 00000000..12a93ae4 --- /dev/null +++ b/lib/torch/nn/transformer.rb @@ -0,0 +1,103 @@ +require_relative 'transformer_encoder_layer' +require_relative 'transformer_encoder' +require_relative 'transformer_decoder_layer' +require_relative 
'transformer_decoder' + +module Torch + module NN + class Transformer < Module + def initialize( + d_model: 512, nhead: 8, + num_encoder_layers: 6, num_decoder_layers: 6, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + custom_encoder: nil, custom_decoder: nil, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @encoder = + if custom_encoder + custom_encoder + else + encoder_layer = TransformerEncoderLayer.new( + d_model, nhead, + dim_feedforward: dim_feedforward, dropout: dropout, activation: activation, + layer_norm_eps: layer_norm_eps, batch_first: batch_first + ) + encoder_norm = LayerNorm.new(d_model, eps: layer_norm_eps) + TransformerEncoder.new(encoder_layer, num_encoder_layers, norm: encoder_norm) + end + + @decoder = + if custom_decoder + custom_decoder + else + decoder_layer = TransformerDecoderLayer.new( + d_model, nhead, + dim_feedforward: dim_feedforward, dropout: dropout, activation: activation, + layer_norm_eps: layer_norm_eps, batch_first: batch_first + ) + decoder_norm = LayerNorm.new(d_model, eps: layer_norm_eps) + TransformerDecoder.new(decoder_layer, num_decoder_layers, norm: decoder_norm) + end + + reset_parameters + + @d_model = d_model + @nhead = nhead + @batch_first = batch_first + end + + attr_reader :d_model, :nhead, :encoder, :decoder + + def batch_first? + !!@batch_first + end + + def reset_parameters + parameters.each { |p| Init.xavier_uniform!(p) if p.dim > 1 } + end + + def forward( + src, tgt, + src_mask: nil, tgt_mask: nil, memory_mask: nil, + src_key_padding_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil + ) + + if (!batch_first? && src.size(1) != tgt.size(1)) || + (batch_first? 
&& src.size(0) != tgt.size(0)) + + raise ArgumentError, "The batch number of src and tgt must be equal" + end + + if src.size(2) != d_model || tgt.size(2) != d_model + raise ArgumentError, "The feature number of src and tgt must be equal to d_model" + end + + memory = @encoder.(src, mask: src_mask, src_key_padding_mask: src_key_padding_mask) + @decoder.( + tgt, memory, + tgt_mask: tgt_mask, memory_mask: memory_mask, + tgt_key_padding_mask: tgt_key_padding_mask, memory_key_padding_mask: memory_key_padding_mask + ) + end + + class << self + def generate_square_subsequent_mask(sz) + mask = Torch.triu(Torch.ones([sz, sz])).eq(1).transpose(0, 1) + mask.float.masked_fill!(mask.eq(0), -Float::INFINITY).masked_fill!(mask.eq(1), 0.0) + mask + end + + alias :square_subsequent_mask :generate_square_subsequent_mask + end + + def generate_square_subsequent_mask(sz) + self.class.square_subsequent_mask(sz) + end + + alias :square_subsequent_mask :generate_square_subsequent_mask + end + end +end diff --git a/lib/torch/nn/transformer_decoder.rb b/lib/torch/nn/transformer_decoder.rb new file mode 100644 index 00000000..6a985853 --- /dev/null +++ b/lib/torch/nn/transformer_decoder.rb @@ -0,0 +1,23 @@ +module Torch + module NN + class TransformerDecoder < Module + def initialize(decoder_layer, num_layers, norm: nil) + super() + + state = decoder_layer.state_dict + layers = num_layers.times.map do |i| + decoder_layer.clone.tap { |l| l.load_state_dict(state) } + end + @layers = ModuleList.new(layers) + + @num_layers = num_layers + @norm = norm + end + + def forward(tgt, memory, tgt_mask: nil, memory_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil) + out = @layers.inject(tgt) { |kv, l| l.(kv, memory, tgt_mask: tgt_mask, memory_mask: memory_mask, tgt_key_padding_mask: tgt_key_padding_mask, memory_key_padding_mask: memory_key_padding_mask) } + @norm ? 
@norm.(out) : out + end + end + end +end diff --git a/lib/torch/nn/transformer_decoder_layer.rb b/lib/torch/nn/transformer_decoder_layer.rb new file mode 100644 index 00000000..139fd36b --- /dev/null +++ b/lib/torch/nn/transformer_decoder_layer.rb @@ -0,0 +1,56 @@ +module Torch + module NN + class TransformerDecoderLayer < Module + def initialize( + d_model, n_head, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @self_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + @multihead_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + + @linear1 = Linear.new(d_model, dim_feedforward) + @dropout = Dropout.new(p: dropout) + @linear2 = Linear.new(dim_feedforward, d_model) + + @norm1 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm2 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm3 = LayerNorm.new(d_model, eps: layer_norm_eps) + + @dropout1 = Dropout.new(p: dropout) + @dropout2 = Dropout.new(p: dropout) + @dropout3 = Dropout.new(p: dropout) + + @activation = activation_fn(activation) + end + + def forward(tgt, memory, tgt_mask: nil, memory_mask: nil, tgt_key_padding_mask: nil, memory_key_padding_mask: nil) + tmp = @self_attn.(tgt, tgt, tgt, attn_mask: tgt_mask, key_padding_mask: tgt_key_padding_mask).first + out = tgt + @dropout1.(tmp) + out = @norm1.(out) + + tmp = @multihead_attn.(tgt, memory, memory, attn_mask: memory_mask, key_padding_mask: memory_key_padding_mask).first + out += @dropout2.(tmp) + out = @norm2.(out) + + tmp = @activation.(@linear1.(out)) + tmp = @linear2.(@dropout.(tmp)) + out += @dropout3.(tmp) + + @norm3.(out) + end + + private + def activation_fn(activation) + case activation.to_sym + when :relu then F.method(:relu) + when :gelu then F.method(:gelu) + else raise ArgumentError, "Activation should be relu/gelu, not `#{activation}`" + end + end + end + end +end diff --git 
a/lib/torch/nn/transformer_encoder.rb b/lib/torch/nn/transformer_encoder.rb new file mode 100644 index 00000000..4f4a245e --- /dev/null +++ b/lib/torch/nn/transformer_encoder.rb @@ -0,0 +1,23 @@ +module Torch + module NN + class TransformerEncoder < Module + def initialize(encoder_layer, num_layers, norm: nil) + super() + + state = encoder_layer.state_dict + layers = num_layers.times.map do |i| + encoder_layer.clone.tap { |l| l.load_state_dict(state) } + end + @layers = ModuleList.new(layers) + + @num_layers = num_layers + @norm = norm + end + + def forward(src, mask: nil, src_key_padding_mask: nil) + out = @layers.inject(src) { |q, l| l.(q, src_mask: mask, src_key_padding_mask: src_key_padding_mask) } + @norm ? @norm.(out) : out + end + end + end +end diff --git a/lib/torch/nn/transformer_encoder_layer.rb b/lib/torch/nn/transformer_encoder_layer.rb new file mode 100644 index 00000000..c9aa5589 --- /dev/null +++ b/lib/torch/nn/transformer_encoder_layer.rb @@ -0,0 +1,48 @@ +module Torch + module NN + class TransformerEncoderLayer < Module + def initialize( + d_model, n_head, + dim_feedforward: 2048, dropout: 0.1, activation: :relu, + layer_norm_eps: 1e-5, batch_first: false + ) + + super() + + @self_attn = MultiheadAttention.new(d_model, n_head, dropout: dropout, batch_first: batch_first) + @linear1 = Linear.new(d_model, dim_feedforward) + @dropout = Dropout.new(p: dropout) + @linear2 = Linear.new(dim_feedforward, d_model) + + @norm1 = LayerNorm.new(d_model, eps: layer_norm_eps) + @norm2 = LayerNorm.new(d_model, eps: layer_norm_eps) + + @dropout1 = Dropout.new(p: dropout) + @dropout2 = Dropout.new(p: dropout) + + @activation = activation_fn(activation) + end + + def forward(src, src_mask: nil, src_key_padding_mask: nil) + tmp = @self_attn.(src, src, src, attn_mask: src_mask, key_padding_mask: src_key_padding_mask).first + out = src + @dropout1.(tmp) + out = @norm1.(out) + + tmp = @activation.(@linear1.(out)) + tmp = @linear2.(@dropout.(tmp)) + out += @dropout2.(tmp) 
+ + @norm2.(out) + end + + private + def activation_fn(activation) + case activation.to_sym + when :relu then F.method(:relu) + when :gelu then F.method(:gelu) + else raise ArgumentError, "Activation should be relu/gelu, not `#{activation}`" + end + end + end + end +end diff --git a/test/nn/transformer_test.rb b/test/nn/transformer_test.rb new file mode 100644 index 00000000..385818ae --- /dev/null +++ b/test/nn/transformer_test.rb @@ -0,0 +1,110 @@ +require_relative '../test_helper' + +class TranformerTest < Minitest::Test + T = 4 + S = 8 + B = 2 + E = 6 + + SEED = 42 + + NHEAD = 2 + + def test_transformer_encoder + Torch.manual_seed SEED + src = Torch.randn(S, B, E) + layer = Torch::NN::TransformerEncoderLayer.new(E, NHEAD) + encoder = Torch::NN::TransformerEncoder.new(layer, 4) + + expected_keys = ['layers.0.self_attn.in_proj_weight', 'layers.0.self_attn.in_proj_bias', 'layers.0.self_attn.out_proj.weight', 'layers.0.self_attn.out_proj.bias', 'layers.0.linear1.weight', 'layers.0.linear1.bias', 'layers.0.linear2.weight', 'layers.0.linear2.bias', 'layers.0.norm1.weight', 'layers.0.norm1.bias', 'layers.0.norm2.weight', 'layers.0.norm2.bias', 'layers.1.self_attn.in_proj_weight', 'layers.1.self_attn.in_proj_bias', 'layers.1.self_attn.out_proj.weight', 'layers.1.self_attn.out_proj.bias', 'layers.1.linear1.weight', 'layers.1.linear1.bias', 'layers.1.linear2.weight', 'layers.1.linear2.bias', 'layers.1.norm1.weight', 'layers.1.norm1.bias', 'layers.1.norm2.weight', 'layers.1.norm2.bias', 'layers.2.self_attn.in_proj_weight', 'layers.2.self_attn.in_proj_bias', 'layers.2.self_attn.out_proj.weight', 'layers.2.self_attn.out_proj.bias', 'layers.2.linear1.weight', 'layers.2.linear1.bias', 'layers.2.linear2.weight', 'layers.2.linear2.bias', 'layers.2.norm1.weight', 'layers.2.norm1.bias', 'layers.2.norm2.weight', 'layers.2.norm2.bias', 'layers.3.self_attn.in_proj_weight', 'layers.3.self_attn.in_proj_bias', 'layers.3.self_attn.out_proj.weight', 'layers.3.self_attn.out_proj.bias', 
'layers.3.linear1.weight', 'layers.3.linear1.bias', 'layers.3.linear2.weight', 'layers.3.linear2.bias', 'layers.3.norm1.weight', 'layers.3.norm1.bias', 'layers.3.norm2.weight', 'layers.3.norm2.bias'] + assert_equal Set.new(encoder.state_dict.keys), Set.new(expected_keys) + + out = encoder.(src).detach + + expected_out = Torch.tensor([ + [[ 0.7493, 0.4482, -2.1426, 0.5586, 0.5540, -0.1676], + [-1.7787, 1.3332, -0.3269, -0.2184, 0.9501, 0.0408]], + + [[ 0.0258, -0.3633, 0.4725, -0.5102, 1.8175, -1.4423], + [-0.8428, 0.8163, -1.7820, 0.9993, 0.1579, 0.6513]], + + [[-0.8899, 0.4441, -0.8299, 0.1568, 1.9144, -0.7954], + [ 0.9666, -1.8733, 1.0490, 0.3950, -0.5475, 0.0102]], + + [[-0.7694, 1.4112, -0.7571, -0.2797, 1.3567, -0.9616], + [-0.8945, 1.2717, 1.4981, -0.8380, -0.2971, -0.7402]], + + [[ 1.3992, -1.0341, -1.3842, -0.0247, 0.0162, 1.0276], + [-0.8861, 0.9142, -0.5524, 0.8005, 1.1647, -1.4410]], + + [[ 0.1054, -1.9251, -0.0421, 0.2794, 1.4807, 0.1016], + [-0.5518, -0.8835, -0.7934, 0.6458, 1.9350, -0.3522]], + + [[ 1.3186, -1.4948, -1.1052, 0.1480, 0.3011, 0.8324], + [-1.0710, 1.1253, -1.0413, -0.5237, 1.4925, 0.0183]], + + [[ 0.9012, -1.3407, 0.7998, -0.7706, -0.8129, 1.2232], + [ 0.5637, -1.5301, 1.0149, 1.2128, -0.7807, -0.4805]] + ]) + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end + + def test_transformer_decoder + Torch.manual_seed SEED + memory = Torch.randn([S, B, E]) + tgt = Torch.randn(T, B, E) + layer = Torch::NN::TransformerDecoderLayer.new(E, NHEAD) + decoder = Torch::NN::TransformerDecoder.new(layer, 4) + + expected_keys = ['layers.0.self_attn.in_proj_weight', 'layers.0.self_attn.in_proj_bias', 'layers.0.self_attn.out_proj.weight', 'layers.0.self_attn.out_proj.bias', 'layers.0.multihead_attn.in_proj_weight', 'layers.0.multihead_attn.in_proj_bias', 'layers.0.multihead_attn.out_proj.weight', 'layers.0.multihead_attn.out_proj.bias', 'layers.0.linear1.weight', 'layers.0.linear1.bias', 
'layers.0.linear2.weight', 'layers.0.linear2.bias', 'layers.0.norm1.weight', 'layers.0.norm1.bias', 'layers.0.norm2.weight', 'layers.0.norm2.bias', 'layers.0.norm3.weight', 'layers.0.norm3.bias', 'layers.1.self_attn.in_proj_weight', 'layers.1.self_attn.in_proj_bias', 'layers.1.self_attn.out_proj.weight', 'layers.1.self_attn.out_proj.bias', 'layers.1.multihead_attn.in_proj_weight', 'layers.1.multihead_attn.in_proj_bias', 'layers.1.multihead_attn.out_proj.weight', 'layers.1.multihead_attn.out_proj.bias', 'layers.1.linear1.weight', 'layers.1.linear1.bias', 'layers.1.linear2.weight', 'layers.1.linear2.bias', 'layers.1.norm1.weight', 'layers.1.norm1.bias', 'layers.1.norm2.weight', 'layers.1.norm2.bias', 'layers.1.norm3.weight', 'layers.1.norm3.bias', 'layers.2.self_attn.in_proj_weight', 'layers.2.self_attn.in_proj_bias', 'layers.2.self_attn.out_proj.weight', 'layers.2.self_attn.out_proj.bias', 'layers.2.multihead_attn.in_proj_weight', 'layers.2.multihead_attn.in_proj_bias', 'layers.2.multihead_attn.out_proj.weight', 'layers.2.multihead_attn.out_proj.bias', 'layers.2.linear1.weight', 'layers.2.linear1.bias', 'layers.2.linear2.weight', 'layers.2.linear2.bias', 'layers.2.norm1.weight', 'layers.2.norm1.bias', 'layers.2.norm2.weight', 'layers.2.norm2.bias', 'layers.2.norm3.weight', 'layers.2.norm3.bias', 'layers.3.self_attn.in_proj_weight', 'layers.3.self_attn.in_proj_bias', 'layers.3.self_attn.out_proj.weight', 'layers.3.self_attn.out_proj.bias', 'layers.3.multihead_attn.in_proj_weight', 'layers.3.multihead_attn.in_proj_bias', 'layers.3.multihead_attn.out_proj.weight', 'layers.3.multihead_attn.out_proj.bias', 'layers.3.linear1.weight', 'layers.3.linear1.bias', 'layers.3.linear2.weight', 'layers.3.linear2.bias', 'layers.3.norm1.weight', 'layers.3.norm1.bias', 'layers.3.norm2.weight', 'layers.3.norm2.bias', 'layers.3.norm3.weight', 'layers.3.norm3.bias'] + assert_equal Set.new(decoder.state_dict.keys), Set.new(expected_keys) + + out = decoder.(tgt, memory).detach + + 
expected_out = Torch.tensor([ + [[ 0.9910, -1.6614, 0.4585, 1.1229, -0.8866, -0.0244], + [ 0.2247, -0.9688, 0.4191, 1.8912, -0.9096, -0.6565]], + + [[-0.0579, -0.8439, 1.1724, 0.8325, 0.5904, -1.6936], + [ 0.7203, -0.9428, 1.3076, 0.3839, 0.1755, -1.6445]], + + [[ 1.1308, -1.1648, 0.9485, 0.5929, -0.0547, -1.4527], + [-0.2060, -1.2025, 0.2268, 1.5961, 0.7484, -1.1629]], + + [[-0.2963, -0.6104, 1.0706, 1.4588, -0.1225, -1.5001], + [ 0.8797, -1.1604, 0.9647, 0.8675, -0.0712, -1.4803]] + ]) + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end + + def test_entire_transformer + Torch.manual_seed SEED + src = Torch.randn([S, B, E]) + tgt = Torch.randn(T, B, E) + + tf = Torch::NN::Transformer.new(d_model: E, nhead: NHEAD) + out = tf.(src, tgt).detach + + expected_out = Torch.tensor([ + [[ 1.3946, 1.0311, -0.4112, -1.4705, -0.7782, 0.2342], + [ 1.3813, 0.7335, 0.4295, -1.7469, -0.3987, -0.3987]], + + [[ 0.8528, 0.2527, 1.0666, -1.0627, 0.5239, -1.6332], + [ 1.0099, 0.6658, 1.2135, -1.2414, -1.1116, -0.5361]], + + [[ 0.7495, 0.7391, 1.1455, -1.5647, -0.0059, -1.0636], + [ 0.6769, -0.6463, 1.1300, -0.6820, 1.0389, -1.5175]], + + [[ 1.0712, 0.8934, 0.2774, -1.7420, 0.3894, -0.8894], + [ 0.9592, 0.6803, 1.0008, -1.6594, -0.0541, -0.9268]] + ]) + + + assert_equal out.shape, expected_out.shape + assert (expected_out - out).abs.lt(1e-6).all + end +end From 512962c961342045a26a3877fce4d719a6d11a57 Mon Sep 17 00:00:00 2001 From: Yvan Date: Thu, 30 Sep 2021 21:12:49 +0300 Subject: [PATCH 16/28] attribute readers for ConvNd It is often useful to access convolutional layer attributes, e.g. for output shapes precalculation. 
--- lib/torch/nn/convnd.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/torch/nn/convnd.rb b/lib/torch/nn/convnd.rb index cb300cb4..86289d1b 100644 --- a/lib/torch/nn/convnd.rb +++ b/lib/torch/nn/convnd.rb @@ -1,6 +1,8 @@ module Torch module NN class ConvNd < Module + attr_reader :in_channels, :out_channels, :kernel_size, :stride, :padding, :dilation, :transposed, :output_padding, :groups, :padding_mode + def initialize(in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) super() raise ArgumentError, "in_channels must be divisible by groups" if in_channels % groups != 0 From 480baa8e3e8687636b706e5060cf8539b3e591f4 Mon Sep 17 00:00:00 2001 From: Yvan Date: Thu, 30 Sep 2021 23:54:37 +0300 Subject: [PATCH 17/28] Fixed generation of square subsequent mask --- lib/torch/nn/transformer.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/torch/nn/transformer.rb b/lib/torch/nn/transformer.rb index c19b1e26..2d9912cb 100644 --- a/lib/torch/nn/transformer.rb +++ b/lib/torch/nn/transformer.rb @@ -87,7 +87,6 @@ class << self def generate_square_subsequent_mask(sz) mask = Torch.triu(Torch.ones([sz, sz])).eq(1).transpose(0, 1) mask.float.masked_fill!(mask.eq(0), -Float::INFINITY).masked_fill!(mask.eq(1), 0.0) - mask end alias :square_subsequent_mask :generate_square_subsequent_mask From 64be5d305b80653c53c33b556fce125cf016f98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 14 Nov 2025 17:30:42 +0300 Subject: [PATCH 18/28] distributed data parallel port with the supporting features: Torch#load:map_location, torchrun, Tests are provided --- README.md | 28 + bin/torchrun | 6 + examples/mnist/distributed.rb | 198 +++++++ ext/torch/accelerator.cpp | 52 ++ ext/torch/distributed.cpp | 271 +++++++++ ext/torch/ext.cpp | 4 + ext/torch/tensor.cpp | 110 ++++ lib/torch.rb | 118 +++- lib/torch/accelerator.rb | 20
+ lib/torch/distributed.rb | 195 +++++++ .../nn/parallel/distributed_data_parallel.rb | 101 ++++ lib/torch/torchrun.rb | 512 ++++++++++++++++++ test/distributed_test.rb | 77 +++ test/save_test.rb | 55 ++ test/support/scripts/show_ranks.rb | 7 + test/torchrun_test.rb | 33 ++ torch-rb.gemspec | 4 +- 17 files changed, 1788 insertions(+), 3 deletions(-) create mode 100755 bin/torchrun create mode 100644 examples/mnist/distributed.rb create mode 100644 ext/torch/accelerator.cpp create mode 100644 ext/torch/distributed.cpp create mode 100644 lib/torch/accelerator.rb create mode 100644 lib/torch/distributed.rb create mode 100644 lib/torch/nn/parallel/distributed_data_parallel.rb create mode 100644 lib/torch/torchrun.rb create mode 100644 test/distributed_test.rb create mode 100644 test/support/scripts/show_ranks.rb create mode 100644 test/torchrun_test.rb diff --git a/README.md b/README.md index a49c7e3f..d477c8ca 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,35 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori ## Examples - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) +- [Distributed MNIST training](examples/mnist/distributed.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) +## Distributed Training + +Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. 
+ +Start a single-node job with a process per GPU (or CPU) with: + +```sh +bundle exec torchrun --standalone --nproc-per-node=gpu path/to/training_script.rb --script-arg value +``` + +For multi-node runs, launch the same command on every node with matching rendezvous settings: + +```sh +bundle exec torchrun \ + --nnodes=2 \ + --node-rank=0 \ + --rdzv-backend=c10d \ + --rdzv-endpoint=host0.example.com:29503 \ + --rdzv-id=my-job \ + --nproc-per-node=4 \ + path/to/training_script.rb +``` + +On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: @@ -329,6 +355,8 @@ net.load_state_dict(Torch.load("net.pth")) net.eval ``` +`Torch.load` mirrors the Python API and accepts `map_location` and `weights_only` keyword arguments for compatibility with existing PyTorch checkpoints. + When saving a model in Python to load in Ruby, convert parameters to tensors (due to outstanding bugs in LibTorch) ```python diff --git a/bin/torchrun b/bin/torchrun new file mode 100755 index 00000000..d698ade2 --- /dev/null +++ b/bin/torchrun @@ -0,0 +1,6 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative "../lib/torch/torchrun" + +Torch::TorchRun.start(ARGV) diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb new file mode 100644 index 00000000..531b0f87 --- /dev/null +++ b/examples/mnist/distributed.rb @@ -0,0 +1,198 @@ +# Distributed MNIST training with Torch::Distributed + DistributedDataParallel +# Run with: ruby examples/mnist/distributed.rb --gpus 2 + +require "bundler/setup" +require "optparse" +require "torch" +require "torchvision" +require "socket" + +unless Torch::Distributed.available? 
+ abort "torch.distributed was not built in this binary" +end + +class MyNet < Torch::NN::Module + def initialize + super() + @conv1 = Torch::NN::Conv2d.new(1, 32, 3, stride: 1) + @conv2 = Torch::NN::Conv2d.new(32, 64, 3, stride: 1) + @dropout1 = Torch::NN::Dropout2d.new(p: 0.25) + @dropout2 = Torch::NN::Dropout2d.new(p: 0.5) + @fc1 = Torch::NN::Linear.new(9216, 128) + @fc2 = Torch::NN::Linear.new(128, 10) + end + + def forward(x) + x = Torch::NN::F.relu(@conv1.call(x)) + x = Torch::NN::F.relu(@conv2.call(x)) + x = Torch::NN::F.max_pool2d(x, 2) + x = @dropout1.call(x) + x = Torch.flatten(x, start_dim: 1) + x = Torch::NN::F.relu(@fc1.call(x)) + x = @dropout2.call(x) + Torch::NN::F.log_softmax(@fc2.call(x), 1) + end +end + +def parse_options + defaults = { + epochs: 5, + batch_size: 64, + lr: 1.0, + gamma: 0.7, + backend: "gloo", + gpus: Torch::CUDA.available? ? [Torch::CUDA.device_count, 1].max : 1, + log_interval: 20, + data_dir: File.join(__dir__, "data") + } + + OptionParser.new do |opts| + opts.banner = "Usage: ruby distributed.rb [options]" + opts.on("--epochs N", Integer, "Number of epochs (default: #{defaults[:epochs]})") { |v| defaults[:epochs] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } + opts.on("--gamma FLOAT", Float, "LR scheduler gamma (default: #{defaults[:gamma]})") { |v| defaults[:gamma] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--gpus N", Integer, "Number of GPUs/processes to use") { |v| defaults[:gpus] = v } + opts.on("--log-interval N", Integer, "Batches between log statements") { |v| defaults[:log_interval] = v } + opts.on("--data-dir PATH", String, "Directory for cached MNIST data") { |v| defaults[:data_dir] = v } + end.parse!(ARGV) + + defaults +end + 
+def free_port + server = TCPServer.new("127.0.0.1", 0) + port = server.addr[1] + server.close + port +end + +def spawn_workers(world_size) + port = free_port + + world_size.times.map do |rank| + fork do + yield(rank, world_size, port) + end + end.each { Process.wait2(_1) } +end + +def load_datasets(rank, data_dir) + transforms = TorchVision::Transforms::Compose.new([ + TorchVision::Transforms::ToTensor.new, + TorchVision::Transforms::Normalize.new([0.1307], [0.3081]) + ]) + + if rank.zero? + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + test = TorchVision::Datasets::MNIST.new(data_dir, train: false, download: true, transform: transforms) + Torch::Distributed.barrier + else + Torch::Distributed.barrier + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: false, transform: transforms) + test = TorchVision::Datasets::MNIST.new(data_dir, train: false, download: false, transform: transforms) + end + + [train, test] +end + +def subset_for_rank(dataset, rank, world_size) + indices = rank.step(dataset.size - 1, world_size).to_a + Torch::Utils::Data::Subset.new(dataset, indices) +end + +def train_epoch(model, device, loader, optimizer, epoch, rank, log_interval) + model.train + loader.each_with_index do |(data, target), batch_idx| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + next unless rank.zero? && (batch_idx % log_interval).zero? 
+ + processed = batch_idx * data.size(0) + total = loader.dataset.size + percent = 100.0 * processed / total + puts "Rank #{rank} | Epoch #{epoch} [#{processed}/#{total} (#{percent.round})%] Loss: #{'%.4f' % loss.item}" + end +end + +def evaluate(model, device, loader) + model.eval + loss = 0.0 + correct = 0 + Torch.no_grad do + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + output = model.call(data) + loss += Torch::NN::F.nll_loss(output, target, reduction: "sum").item + pred = output.argmax(1, keepdim: true) + correct += pred.eq(target.view_as(pred)).sum.item + end + end + + loss /= loader.dataset.size + acc = 100.0 * correct / loader.dataset.size + puts "Test set: Average loss: #{format('%.4f', loss)}, Accuracy: #{correct}/#{loader.dataset.size} (#{format('%.1f', acc)}%)" +end + +def run_worker(rank, world_size, port, options) + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + accelerator = Torch::Accelerator.current_accelerator + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + + device = if Torch::CUDA.available? && options[:gpus] > 0 + Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + else + Torch.device("cpu") + end + + model = MyNet.new.to(device) + ddp = Torch::NN::Parallel::DistributedDataParallel.new(model, device_ids: device.type == "cuda" ? 
[device.index] : nil) + optimizer = Torch::Optim::Adadelta.new(ddp.module.parameters, lr: options[:lr]) + scheduler = Torch::Optim::LRScheduler::StepLR.new(optimizer, step_size: 1, gamma: options[:gamma]) + + train_dataset, test_dataset = load_datasets(rank, options[:data_dir]) + train_subset = subset_for_rank(train_dataset, rank, world_size) + train_loader = Torch::Utils::Data::DataLoader.new(train_subset, batch_size: options[:batch_size], shuffle: true) + test_loader = Torch::Utils::Data::DataLoader.new(test_dataset, batch_size: options[:batch_size], shuffle: false) if rank.zero? + + options[:epochs].times do |epoch_idx| + epoch = epoch_idx + 1 + train_epoch(ddp, device, train_loader, optimizer, epoch, rank, options[:log_interval]) + if rank.zero? + evaluate(ddp.module, device, test_loader) + end + end + + Torch::Distributed.destroy_process_group +end + +options = parse_options +world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if world_size < 1 +if Torch::CUDA.available? 
+ max_devices = Torch::CUDA.device_count + if world_size > max_devices + raise "Requested #{world_size} GPUs but only #{max_devices} visible" + end +else + puts "CUDA not available, running #{world_size} CPU workers" +end + +Torch.manual_seed(1) + +if world_size == 1 + run_worker(0, 1, free_port, options) +else + spawn_workers(world_size) do |rank, total, port| + run_worker(rank, total, port, options) + end +end diff --git a/ext/torch/accelerator.cpp b/ext/torch/accelerator.cpp new file mode 100644 index 00000000..45cfcb41 --- /dev/null +++ b/ext/torch/accelerator.cpp @@ -0,0 +1,52 @@ +#include +#include +#include + +#include + +#include "utils.h" + +namespace { + +inline bool accelerator_available(c10::DeviceType device_type) { + return at::globalContext() + .getAcceleratorHooksInterface(device_type) + .isAvailable(); +} + +} // namespace + +void init_accelerator(Rice::Module& m) { + auto rb_mAccelerator = Rice::define_module_under(m, "Accelerator"); + + rb_mAccelerator.define_singleton_function( + "_current_device", + []() -> VALUE { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return Rice::Nil; + } + torch::Device device(acc.value()); + return Rice::detail::To_Ruby().convert(device); + }); + + rb_mAccelerator.define_singleton_function( + "_is_available", + []() { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return false; + } + return accelerator_available(acc.value()); + }); + + rb_mAccelerator.define_singleton_function( + "_device_count", + []() { + auto acc = at::getAccelerator(false); + if (!acc.has_value()) { + return 0; + } + return static_cast(at::accelerator::deviceCount()); + }); +} diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp new file mode 100644 index 00000000..de5f7c9a --- /dev/null +++ b/ext/torch/distributed.cpp @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "utils.h" + +#ifdef USE_C10D +#include +#include 
+#include +#include +#include +#include +#endif + +#if defined(USE_C10D) && defined(USE_C10D_NCCL) +#include +#endif + +#if defined(USE_C10D) && !defined(_WIN32) +#include +#endif + +namespace { + +#ifdef USE_C10D + +using StorePtr = c10::intrusive_ptr<::c10d::Store>; +using ProcessGroupPtr = c10::intrusive_ptr<::c10d::ProcessGroup>; + +struct StoreWrapper { + StoreWrapper() = default; + explicit StoreWrapper(StorePtr store) : store_(std::move(store)) {} + + StorePtr store_; +}; + +struct ProcessGroupWrapper { + ProcessGroupWrapper() = default; + explicit ProcessGroupWrapper(ProcessGroupPtr pg) : pg_(std::move(pg)) {} + + ProcessGroupPtr pg_; +}; + +ProcessGroupPtr default_process_group; + +ProcessGroupPtr resolve_process_group(Rice::Object pg_obj) { + if (pg_obj.is_nil()) { + if (!default_process_group) { + rb_raise(rb_eRuntimeError, "Distributed process group not initialized"); + } + return default_process_group; + } + auto& wrapper = Rice::detail::From_Ruby().convert(pg_obj.value()); + if (!wrapper.pg_) { + rb_raise(rb_eRuntimeError, "Invalid process group"); + } + return wrapper.pg_; +} + +int reduce_op_from_int(int code) { + if (code < 0 || code > static_cast(::c10d::ReduceOp::UNUSED)) { + rb_raise(rb_eArgError, "Unknown reduce op code"); + } + return code; +} + +#endif + +} // namespace + +void init_distributed(Rice::Module& m) { + auto rb_mDistributed = Rice::define_module_under(m, "Distributed"); +#ifdef USE_C10D + rb_mDistributed.define_singleton_function("available?", []() { return true; }); + + auto rb_cStore = Rice::define_class_under(rb_mDistributed, "Store"); + rb_cStore.define_method( + "_native?", + [](StoreWrapper& self) { + return static_cast(self.store_); + }); + + auto rb_cProcessGroup = Rice::define_class_under(rb_mDistributed, "ProcessGroup") + .define_method( + "rank", + [](ProcessGroupWrapper& self) { + return self.pg_ ? self.pg_->getRank() : -1; + }) + .define_method( + "size", + [](ProcessGroupWrapper& self) { + return self.pg_ ? 
self.pg_->getSize() : 0; + }) + .define_method( + "backend", + [](ProcessGroupWrapper& self) { + if (!self.pg_) { + return std::string(); + } + return self.pg_->getBackendName(); + }); + + rb_mDistributed.define_singleton_function( + "_create_tcp_store", + [rb_cStore](const std::string& host, + int port, + int world_size, + bool is_master, + int64_t timeout_millis, + bool wait_for_workers) { + ::c10d::TCPStoreOptions opts; + opts.port = static_cast(port); + opts.isServer = is_master; + opts.numWorkers = world_size; + opts.waitWorkers = wait_for_workers; + opts.timeout = std::chrono::milliseconds(timeout_millis); + auto store = c10::make_intrusive<::c10d::TCPStore>(host, opts); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); + + rb_mDistributed.define_singleton_function( + "_create_file_store", + [rb_cStore](const std::string& path, int world_size) { + auto store = c10::make_intrusive<::c10d::FileStore>(path, world_size); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); + +#if !defined(_WIN32) + rb_mDistributed.define_singleton_function( + "_create_hash_store", + [rb_cStore]() { + auto store = c10::make_intrusive<::c10d::HashStore>(); + return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + }); +#endif + + rb_mDistributed.define_singleton_function( + "_init_process_group", + [rb_cProcessGroup](const std::string& backend, + StoreWrapper& store_wrapper, + int rank, + int world_size, + int64_t timeout_millis) { + StorePtr store = store_wrapper.store_; + if (!store) { + rb_raise(rb_eArgError, "Store is required for init_process_group"); + } + + std::string backend_lower = backend; + std::transform(backend_lower.begin(), backend_lower.end(), backend_lower.begin(), ::tolower); + + ProcessGroupPtr pg; + if (backend_lower == "gloo") { +#ifdef USE_C10D_GLOO + auto options = ::c10d::ProcessGroupGloo::Options::create(); + options->timeout = std::chrono::milliseconds(timeout_millis); + 
options->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); + pg = c10::make_intrusive<::c10d::ProcessGroupGloo>(store, rank, world_size, options); +#else + rb_raise(rb_eRuntimeError, "Gloo backend is not available in this build"); +#endif + } else if (backend_lower == "nccl") { +#if defined(USE_C10D_NCCL) + auto options = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(store, rank, world_size, options); +#else + rb_raise(rb_eRuntimeError, "NCCL backend is not available in this build"); +#endif + } else { + rb_raise(rb_eArgError, "Unsupported backend: %s", backend.c_str()); + } + + default_process_group = pg; + return Rice::Data_Object(new ProcessGroupWrapper(pg), rb_cProcessGroup, true); + }); + + rb_mDistributed.define_singleton_function( + "_destroy_process_group", + []() { + default_process_group.reset(); + return Rice::Nil; + }); + + rb_mDistributed.define_singleton_function( + "_initialized?", + []() { + return static_cast(default_process_group); + }); + + rb_mDistributed.define_singleton_function( + "_default_process_group", + [rb_cProcessGroup]() { + if (!default_process_group) { + return Rice::Nil; + } + return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), rb_cProcessGroup, true); + }); + + rb_mDistributed.define_singleton_function( + "_get_world_size", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + return pg->getSize(); + }); + + rb_mDistributed.define_singleton_function( + "_get_rank", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + return pg->getRank(); + }); + + rb_mDistributed.define_singleton_function( + "_barrier", + [](Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::BarrierOptions opts; + auto work = pg->barrier(opts); + work->wait(); + return Rice::Nil; + }); + + rb_mDistributed.define_singleton_function( + "_all_reduce", + [](torch::Tensor& tensor, int op_code, 
Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::AllreduceOptions opts; + opts.reduceOp = ::c10d::ReduceOp(static_cast<::c10d::ReduceOp::RedOpType>(reduce_op_from_int(op_code))); + std::vector tensors{tensor}; + auto work = pg->allreduce(tensors, opts); + work->wait(); + return tensor; + }); + + rb_mDistributed.define_singleton_function( + "_broadcast", + [](torch::Tensor& tensor, int src, Rice::Object pg_obj) { + auto pg = resolve_process_group(pg_obj); + ::c10d::BroadcastOptions opts; + opts.rootRank = src; + std::vector tensors{tensor}; + auto work = pg->broadcast(tensors, opts); + work->wait(); + return tensor; + }); + + auto rb_mReduceOp = Rice::define_module_under(rb_mDistributed, "ReduceOp"); + rb_mReduceOp.const_set("SUM", INT2NUM(static_cast(::c10d::ReduceOp::SUM))); + rb_mReduceOp.const_set("AVG", INT2NUM(static_cast(::c10d::ReduceOp::AVG))); + rb_mReduceOp.const_set("PRODUCT", INT2NUM(static_cast(::c10d::ReduceOp::PRODUCT))); + rb_mReduceOp.const_set("MIN", INT2NUM(static_cast(::c10d::ReduceOp::MIN))); + rb_mReduceOp.const_set("MAX", INT2NUM(static_cast(::c10d::ReduceOp::MAX))); + rb_mReduceOp.const_set("BAND", INT2NUM(static_cast(::c10d::ReduceOp::BAND))); + rb_mReduceOp.const_set("BOR", INT2NUM(static_cast(::c10d::ReduceOp::BOR))); + rb_mReduceOp.const_set("BXOR", INT2NUM(static_cast(::c10d::ReduceOp::BXOR))); + rb_mReduceOp.const_set("PREMUL_SUM", INT2NUM(static_cast(::c10d::ReduceOp::PREMUL_SUM))); + + rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::c10d::kProcessGroupDefaultTimeout.count() / 1000)); +#else + rb_mDistributed.define_singleton_function("available?", []() { return false; }); +#endif +} diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index eb6fb7d3..dc9cef20 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -6,6 +6,8 @@ void init_fft(Rice::Module& m); void init_linalg(Rice::Module& m); void init_nn(Rice::Module& m); void init_special(Rice::Module& m); +void init_accelerator(Rice::Module& m); 
+void init_distributed(Rice::Module& m); void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions); void init_torch(Rice::Module& m); @@ -40,10 +42,12 @@ void Init_ext() { init_fft(m); init_linalg(m); init_special(m); + init_accelerator(m); init_backends(m); init_cuda(m); init_generator(m, rb_cGenerator); init_ivalue(m, rb_cIValue); init_random(m); + init_distributed(m); } diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index 390b5a9c..c7e003d8 100644 --- a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -1,9 +1,12 @@ +#include #include +#include #include #include #include +#include #include "tensor_functions.h" #include "ruby_arg_parser.h" @@ -26,6 +29,103 @@ Array flat_data(Tensor& tensor) { } Rice::Class rb_cTensor; +Rice::Class rb_cHookHandle; + +namespace { + +struct RubyTensorHook { + explicit RubyTensorHook(VALUE proc) : proc_(proc) { + rb_gc_register_address(&proc_); + } + + ~RubyTensorHook() { + rb_gc_unregister_address(&proc_); + } + + at::Tensor call(const at::Tensor& grad) { + HookCallData data{proc_, grad}; + rb_thread_call_with_gvl(&RubyTensorHook::invoke, &data); + if (data.return_value_defined) { + return data.return_tensor; + } + return grad; + } + + private: + struct HookCallData { + VALUE proc; + at::Tensor grad; + at::Tensor return_tensor; + bool return_value_defined = false; + }; + + static void* invoke(void* arg) { + auto* data = reinterpret_cast(arg); + VALUE grad_obj = Rice::detail::To_Ruby().convert(data->grad); + VALUE result = rb_funcall(data->proc, rb_intern("call"), 1, grad_obj); + if (!NIL_P(result)) { + data->return_tensor = Rice::detail::From_Ruby().convert(result); + data->return_value_defined = true; + } + return nullptr; + } + + VALUE proc_; +}; + +class HookHandle { + public: + HookHandle(const at::Tensor& tensor, unsigned handle, std::shared_ptr hook) + : tensor_(tensor), handle_(handle), hook_(std::move(hook)), removed_(false) {} + + HookHandle(const HookHandle& other) = default; + 
HookHandle& operator=(const HookHandle& other) = default; + + ~HookHandle() { + remove(); + } + + void remove() { + if (!removed_) { + tensor_.remove_hook(handle_); + removed_ = true; + hook_.reset(); + } + } + + private: + at::Tensor tensor_; + unsigned handle_; + std::shared_ptr hook_; + bool removed_; +}; + +VALUE tensor_register_hook(int argc, VALUE* argv, VALUE self_) { + HANDLE_TH_ERRORS + VALUE callable = Qnil; + rb_scan_args(argc, argv, "01", &callable); + if (NIL_P(callable)) { + if (rb_block_given_p()) { + callable = rb_block_proc(); + } else { + rb_raise(rb_eArgError, "Expected a callable or block"); + } + } + if (!rb_respond_to(callable, rb_intern("call"))) { + rb_raise(rb_eArgError, "Hook must respond to call"); + } + + Tensor& self = Rice::detail::From_Ruby().convert(self_); + auto hook = std::make_shared(callable); + unsigned handle = self.register_hook([hook](const at::Tensor& grad) { + return hook->call(grad); + }); + + return Rice::Data_Object(new HookHandle(self, handle, hook), rb_cHookHandle, true); + END_HANDLE_TH_ERRORS +} + +} // namespace std::vector index_vector(Array a) { Object obj; @@ -102,7 +202,17 @@ void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions add_tensor_functions(rb_cTensor); THPVariableClass = rb_cTensor.value(); + auto rb_mAutograd = Rice::define_module_under(m, "Autograd"); + rb_cHookHandle = Rice::define_class_under(rb_mAutograd, "RemovableHandle") + .define_method( + "remove", + [](HookHandle& self) { + self.remove(); + return Rice::Nil; + }); + rb_define_method(rb_cTensor, "backward", (VALUE (*)(...)) tensor__backward, -1); + rb_define_method(rb_cTensor, "register_hook", (VALUE (*)(...)) tensor_register_hook, -1); rb_cTensor .define_method("cuda?", [](Tensor& self) { return self.is_cuda(); }) diff --git a/lib/torch.rb b/lib/torch.rb index 266c2859..ce213e1d 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -9,6 +9,8 @@ # modules require_relative "torch/device" +require_relative 
"torch/accelerator" +require_relative "torch/distributed" require_relative "torch/inspector" require_relative "torch/tensor" require_relative "torch/version" @@ -191,6 +193,7 @@ require_relative "torch/nn/functional" require_relative "torch/nn/functional_attention" require_relative "torch/nn/init" +require_relative "torch/nn/parallel/distributed_data_parallel" # utils require_relative "torch/utils/data" @@ -399,11 +402,14 @@ def save(obj, f) File.binwrite(f, _save(to_ivalue(obj))) end - def load(filename) + def load(filename, map_location: nil, weights_only: false) # keep backwards compatibility File.open(filename, "rb") { |f| f.read(1) } - to_ruby(_load(filename)) + result = to_ruby(_load(filename)) + ensure_weights_only_contents!(result) if weights_only + result = apply_map_location(result, map_location) if map_location + result end def tensor(data, **options) @@ -536,6 +542,114 @@ def to_ruby(ivalue) end end + WEIGHTS_ONLY_PRIMITIVE_CLASSES = + [ + NilClass, + TrueClass, + FalseClass, + Integer, + Float, + String + ].freeze + + def ensure_weights_only_contents!(obj) + case obj + when *WEIGHTS_ONLY_PRIMITIVE_CLASSES + obj + when Tensor + obj + when Array + obj.each { |value| ensure_weights_only_contents!(value) } + when Hash + obj.each do |key, value| + ensure_weights_only_contents!(key) + ensure_weights_only_contents!(value) + end + else + raise Error, "weights_only load supports tensors, primitive Ruby types, arrays, and hashes (found #{obj.class.name})" + end + end + + def apply_map_location(obj, map_location) + case obj + when Tensor + map_tensor_location(obj, map_location) + when Array + obj.map { |value| apply_map_location(value, map_location) } + when Hash + obj.each_with_object({}) do |(key, value), memo| + memo[apply_map_location(key, map_location)] = apply_map_location(value, map_location) + end + else + obj + end + end + + def map_tensor_location(tensor, map_location) + case map_location + when nil + tensor + when Hash + target = 
lookup_map_location_target(map_location, tensor.device) + return tensor if target.nil? + map_tensor_location(tensor, target) + else + return map_tensor_location_callable(tensor, map_location) if map_location.respond_to?(:call) + device = normalize_map_location_device(map_location) + tensor.to(device) + end + end + + def map_tensor_location_callable(tensor, callable) + mapped = callable.call(tensor, map_location_device_tag(tensor.device)) + return tensor if mapped.nil? + unless mapped.is_a?(Tensor) + raise Error, "map_location callable must return a Tensor or nil (got #{mapped.class.name})" + end + mapped + end + + def lookup_map_location_target(mapping, device) + key = map_location_device_tag(device) + mapping.each do |candidate, value| + candidate_key = + case candidate + when Device + map_location_device_tag(candidate) + when String, Symbol + candidate.to_s + else + candidate + end + return value if candidate_key == key + end + nil + end + + def map_location_device_tag(device) + case device + when Device + tag = device.type + tag += ":#{device.index}" unless device.index.nil? + tag + when String, Symbol + device.to_s + else + raise Error, "Unknown device reference: #{device.inspect}" + end + end + + def normalize_map_location_device(location) + case location + when Device + location + when String, Symbol + device(location.to_s) + else + raise Error, "Unsupported map_location: #{location.inspect}" + end + end + def tensor_size(size) size.flatten end diff --git a/lib/torch/accelerator.rb b/lib/torch/accelerator.rb new file mode 100644 index 00000000..abfd95bb --- /dev/null +++ b/lib/torch/accelerator.rb @@ -0,0 +1,20 @@ +module Torch + module Accelerator + class << self + def current_accelerator(check_available: false) + device = _current_device + return nil unless device + return nil if check_available && !available? + device + end + + def device_count + _device_count + end + + def available? 
+ _is_available + end + end + end +end diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb new file mode 100644 index 00000000..77428b52 --- /dev/null +++ b/lib/torch/distributed.rb @@ -0,0 +1,195 @@ +require "socket" + +module Torch + module Distributed + DEFAULT_DEVICE_BACKENDS = { + "cpu" => "gloo", + "cuda" => "nccl", + "xpu" => "xccl", + "mps" => "gloo" + }.freeze + + class << self + def initialized? + _initialized? + end + + def init_process_group(backend = nil, init_method: "env://", store: nil, rank: nil, world_size: nil, timeout: DEFAULT_TIMEOUT, wait_for_workers: true, device_id: nil) + raise Torch::Error, "torch.distributed is not available" unless available? + + backend ||= default_backend_for(device_id) + + if store.nil? + case init_method + when "env://" + rank = Integer(ENV.fetch("RANK")) if rank.nil? + world_size = Integer(ENV.fetch("WORLD_SIZE")) if world_size.nil? + master_addr = ENV.fetch("MASTER_ADDR", "127.0.0.1") + master_port = Integer(ENV.fetch("MASTER_PORT", "29500")) + raise ArgumentError, "rank is required" if rank.nil? + raise ArgumentError, "world_size is required" if world_size.nil? + is_master = rank.zero? + store = TCPStore.new(master_addr, master_port, world_size, is_master, wait_for_workers: wait_for_workers, timeout: timeout) + else + raise ArgumentError, "store is required when using init_method=#{init_method.inspect}" + end + end + + raise ArgumentError, "rank is required" if rank.nil? + raise ArgumentError, "world_size is required" if world_size.nil? 
+ + timeout_ms = (timeout * 1000).to_i + _init_process_group(backend, store, rank, world_size, timeout_ms) + end + + def destroy_process_group + _destroy_process_group + end + + def default_process_group + _default_process_group + end + + def get_world_size(group = nil) + ensure_process_group!(group) + _get_world_size(group) + end + + def get_rank(group = nil) + ensure_process_group!(group) + _get_rank(group) + end + + def barrier(group: nil) + ensure_process_group!(group) + _barrier(group) + end + + def all_reduce(tensor, op: ReduceOp::SUM, group: nil) + ensure_process_group!(group) + _all_reduce(tensor, op, group) + end + + def broadcast(tensor, src:, group: nil) + ensure_process_group!(group) + _broadcast(tensor, src, group) + end + + def get_default_backend_for_device(device) + backend = DEFAULT_DEVICE_BACKENDS[device_type_from(device)] + raise ArgumentError, "Default backend not registered for device: #{device.inspect}" unless backend + backend + end + + def fork_world(world_size, host: "127.0.0.1") + raise ArgumentError, "world_size must be positive" unless world_size.to_i.positive? + raise ArgumentError, "block required" unless block_given? + + port = free_port(host: host) + readers = [] + pids = [] + world_size.times do |rank| + reader, writer = IO.pipe + pid = fork do + reader.close + begin + writer.binmode + result = yield(rank, port) + Marshal.dump(result, writer) + exit! 0 + rescue => e + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + exit! 1 + ensure + writer.close unless writer.closed? + end + end + writer.close + readers << reader + pids << pid + end + + outputs = readers.map do |reader| + data = Marshal.load(reader) + reader.close + data + end + + statuses = pids.each_with_index.map do |pid, idx| + _pid, status = Process.wait2(pid) + [idx, pid, status] + end + + statuses.each do |idx, pid, status| + output = outputs[idx] + if !status.success? 
|| (output.is_a?(Hash) && output[:error]) + message = if output.is_a?(Hash) && output[:error] + "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" + else + "Child #{pid} exited with status #{status.exitstatus}" + end + raise Torch::Error, message + end + end + + outputs + end + + def free_port(host: "127.0.0.1") + server = TCPServer.new(host, 0) + port = server.addr[1] + server.close + port + end + + private + + def ensure_process_group!(group) + return if group || initialized? + + raise Torch::Error, "Default process group is not initialized" + end + + def default_backend_for(device_id) + get_default_backend_for_device(device_id) + end + + def device_type_from(device) + case device + when Torch::Device + device.type + when String + Torch.device(device).type + when Integer + Torch.device("cuda:#{device}").type + when NilClass + Torch::Accelerator.current_accelerator&.type || "cpu" + else + Torch.device(device).type + end + rescue => e + raise ArgumentError, "Invalid device #{device.inspect}: #{e.message}" + end + end + + class TCPStore + def self.new(host, port, world_size, is_master, wait_for_workers: true, timeout: DEFAULT_TIMEOUT) + Torch::Distributed._create_tcp_store(host, port, world_size, is_master, (timeout * 1000).to_i, wait_for_workers) + end + end + + class FileStore + def self.new(path, world_size) + Torch::Distributed._create_file_store(path, world_size) + end + end + + if respond_to?(:_create_hash_store) + class HashStore + def self.new + Torch::Distributed._create_hash_store + end + end + end + end +end diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb new file mode 100644 index 00000000..87178f3b --- /dev/null +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -0,0 +1,101 @@ +module Torch + module NN + module Parallel + class DistributedDataParallel < Module + attr_reader :module, :process_group + + def initialize(mod, device_ids: nil, 
process_group: nil, broadcast_buffers: true) + super() + raise Torch::Error, "torch.distributed is not available" unless Torch::Distributed.available? + + @module = mod + @broadcast_buffers = broadcast_buffers + @process_group = process_group || Torch::Distributed.default_process_group + raise Torch::Error, "Process group must be initialized before using DistributedDataParallel" unless @process_group + + @world_size = Torch::Distributed.get_world_size(@process_group) + @rank = Torch::Distributed.get_rank(@process_group) + @device = Array(device_ids).compact.first + move_to_device(@device) if @device + + synchronize_parameters + @hook_handles = register_parameter_hooks + end + + def forward(*inputs, **kwargs) + outputs = @module.call(*move_inputs(inputs), **move_kwargs(kwargs)) + broadcast_buffers_if_needed + outputs + end + + alias_method :call, :forward + + def train(mode = true) + @module.train(mode) + broadcast_buffers_if_needed + self + end + + private + + def move_to_device(device) + return unless device + + @module.to(device) + end + + def move_inputs(inputs) + return inputs unless @device + + inputs.map { |value| move_value(value, @device) } + end + + def move_kwargs(kwargs) + return kwargs unless @device + + kwargs.transform_values { |value| move_value(value, @device) } + end + + def move_value(value, device) + case value + when Torch::Tensor + value.to(device) + when Array + value.map { |v| move_value(v, device) } + when Hash + value.transform_values { |v| move_value(v, device) } + else + value + end + end + + def synchronize_parameters + Torch::Distributed.barrier(group: @process_group) + @module.parameters.each do |param| + Torch::Distributed.broadcast(param, src: 0, group: @process_group) + end + broadcast_buffers_if_needed + end + + def broadcast_buffers_if_needed + return unless @broadcast_buffers + + @module.buffers.each do |buffer| + Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + end + end + + def register_parameter_hooks + 
@module.parameters.filter_map do |param| + next unless param.requires_grad? + + param.register_hook do |grad| + Torch::Distributed.all_reduce(grad, group: @process_group) + grad.div!(@world_size.to_f) + end + end + end + end + end + end +end diff --git a/lib/torch/torchrun.rb b/lib/torch/torchrun.rb new file mode 100644 index 00000000..b5913334 --- /dev/null +++ b/lib/torch/torchrun.rb @@ -0,0 +1,512 @@ +# frozen_string_literal: true + +require "optparse" +require "socket" +require "etc" +require "securerandom" +require "rbconfig" + +require_relative "../torch" + +module Torch + module TorchRun + SIGNALS = %w[INT TERM QUIT].freeze + + class Error < StandardError; end + + class Parser + attr_reader :parser + + def initialize + @parser = OptionParser.new + end + + def parse(argv) + options = default_options + + parser.banner = "Usage: torchrun [options] TRAINING_SCRIPT [script args]" + parser.separator "" + parser.separator "Launch parameters:" + + parser.on("--nnodes MIN[:MAX]", String, "Number of nodes or range (default: #{options[:nnodes]})") do |value| + options[:nnodes] = value + end + + parser.on("--nproc-per-node VALUE", String, "Processes per node (int, gpu, cpu, auto). Default: #{options[:nproc_per_node]}") do |value| + options[:nproc_per_node] = value + end + + parser.on("--node-rank VALUE", Integer, "Rank of the node for multi-node jobs. Default: #{options[:node_rank]}") do |value| + options[:node_rank] = value + end + + parser.on("--rdzv-backend NAME", String, "Rendezvous backend (static or c10d). Default: #{options[:rdzv_backend]}") do |value| + options[:rdzv_backend] = value + end + + parser.on("--rdzv-endpoint HOST[:PORT]", String, "Rendezvous endpoint. Default: use --master-addr/--master-port") do |value| + options[:rdzv_endpoint] = value + end + + parser.on("--rdzv-id ID", String, "User defined job id. 
Default: #{options[:rdzv_id]}") do |value| + options[:rdzv_id] = value + end + + parser.on("--rdzv-conf CONF", String, "Additional rendezvous config (k=v,k2=v2)") do |value| + options[:rdzv_conf] = parse_kv_pairs(value) + end + + parser.on("--standalone", "Start a local rendezvous store on a free port") do + options[:standalone] = true + end + + parser.on("--max-restarts VALUE", Integer, "Restarts before failing. Default: #{options[:max_restarts]}") do |value| + options[:max_restarts] = value + end + + parser.on("--monitor-interval SECONDS", Float, "Delay between restart attempts. Default: #{options[:monitor_interval]}") do |value| + options[:monitor_interval] = value + end + + parser.on("--role NAME", String, "Role for the worker group. Default: #{options[:role]}") do |value| + options[:role] = value + end + + parser.on("--master-addr HOST", String, "Master address for static rendezvous. Default: #{options[:master_addr]}") do |value| + options[:master_addr] = value + end + + parser.on("--master-port PORT", Integer, "Master port for static rendezvous. Default: #{options[:master_port]}") do |value| + options[:master_port] = value + end + + parser.on("--pass-local-rank-arg", "Append --local-rank to the training script invocation") do + options[:pass_local_rank_arg] = true + end + + parser.on("--no-ruby", "Execute the training script directly instead of `#{RbConfig.ruby}`") do + options[:no_ruby] = true + end + + parser.on("-h", "--help", "Prints this help") do + puts parser + exit + end + + rest = parser.parse!(argv) + raise OptionParser::MissingArgument, "training_script" if rest.empty? 
+ + training_script = rest.shift + [options, training_script, rest] + end + + def to_s + parser.to_s + end + + private + + def default_options + { + nnodes: "1:1", + nproc_per_node: "1", + node_rank: 0, + rdzv_backend: "static", + rdzv_endpoint: "", + rdzv_id: "none", + rdzv_conf: {}, + standalone: false, + max_restarts: 0, + monitor_interval: 1.0, + role: "default", + master_addr: "127.0.0.1", + master_port: 29_500, + pass_local_rank_arg: false, + no_ruby: false + } + end + + def parse_kv_pairs(value) + return {} if value.nil? || value.strip.empty? + + value.split(",").each_with_object({}) do |pair, acc| + key, val = pair.split("=", 2) + raise OptionParser::InvalidArgument, "Invalid rendezvous config entry: #{pair.inspect}" unless key && val + + acc[key.strip] = val.strip + end + end + end + + module_function + + def start(argv, out: $stdout, err: $stderr) + parser = Parser.new + options, script, script_args = parser.parse(argv) + status = Launcher.new(options, script, script_args, out: out, err: err).run + exit(status) + rescue OptionParser::ParseError => e + err.puts(e.message) + err.puts(parser) + exit(2) + rescue Error => e + err.puts("torchrun: #{e.message}") + exit(1) + end + + class Launcher + def initialize(options, script, script_args, out: $stdout, err: $stderr) + @options = options + @script = script + @script_args = script_args + @out = out + @err = err + + @local_world_size = determine_local_world_size(@options[:nproc_per_node]) + @min_nodes, @max_nodes = parse_nnodes(@options[:nnodes]) + @num_nodes = ensure_fixed_nnodes(@min_nodes, @max_nodes) + @node_rank = @options[:node_rank] + @max_restarts = [@options[:max_restarts], 0].max + @monitor_interval = [@options[:monitor_interval], 0.0].max + @role = @options[:role] + @pass_local_rank_arg = @options[:pass_local_rank_arg] + @no_ruby = @options[:no_ruby] + validate_node_rank! + + setup_rendezvous! 
+ end + + def run + restarts = 0 + + loop do + status = launch_worker_group(restarts) + return status if status.zero? || @signal_received + return status if restarts >= @max_restarts + + restarts += 1 + log("Worker group failed (exit #{status}). Restarting #{restarts}/#{@max_restarts} ...") + sleep(@monitor_interval) if @monitor_interval.positive? + end + end + + private + + def launch_worker_group(restart_count) + @signal_received = nil + @current_pids = spawn_workers(restart_count) + handler_state = setup_signal_handlers + status = monitor_workers(@current_pids.dup) + cleanup_workers(@current_pids) + restore_signal_handlers(handler_state) + return signal_exit_status if @signal_received + + status + ensure + @current_pids = [] + end + + def spawn_workers(restart_count) + base_env = base_environment(restart_count) + Array.new(@local_world_size) do |local_rank| + env = base_env.merge(rank_environment(local_rank)) + spawn_worker(env, local_rank) + end + end + + def spawn_worker(env, local_rank) + args = command_arguments(local_rank) + Process.spawn(env, *args) + rescue SystemCallError => e + raise Error, "failed to launch worker #{local_rank}: #{e.message}" + end + + def command_arguments(local_rank) + cmd = [] + if @no_ruby + cmd << @script + else + cmd << RbConfig.ruby + cmd << @script + end + cmd.concat(@script_args) + cmd << "--local-rank=#{local_rank}" if @pass_local_rank_arg + cmd + end + + def base_environment(restart_count) + endpoint = "#{@master_addr}:#{@master_port}" + env = { + "MASTER_ADDR" => @master_addr, + "MASTER_PORT" => @master_port.to_s, + "WORLD_SIZE" => world_size.to_s, + "LOCAL_WORLD_SIZE" => @local_world_size.to_s, + "GROUP_RANK" => @node_rank.to_s, + "TORCHRUN_ROLE" => @role, + "TORCHRUN_NNODES" => @num_nodes.to_s, + "TORCHRUN_NPROC_PER_NODE" => @local_world_size.to_s, + "TORCHELASTIC_RUN_ID" => @rdzv_id, + "TORCHRUN_RDZV_BACKEND" => @rdzv_backend, + "TORCHRUN_RDZV_ENDPOINT" => endpoint, + "TORCHELASTIC_RESTART_COUNT" => restart_count.to_s, + 
"TORCHRUN_STANDALONE" => @standalone ? "1" : "0" + } + unless @rdzv_conf.empty? + env["TORCHRUN_RDZV_CONF"] = @rdzv_conf.map { |k, v| "#{k}=#{v}" }.join(",") + end + ENV.to_h.merge(env) + end + + def rank_environment(local_rank) + rank = @node_rank * @local_world_size + local_rank + { + "LOCAL_RANK" => local_rank.to_s, + "RANK" => rank.to_s + } + end + + def monitor_workers(pids) + exit_code = 0 + remaining = pids.dup + until remaining.empty? + pid, status = Process.wait2 + next unless pid + + remaining.delete(pid) + unless status.success? + exit_code = exit_status_from(status) + terminate_workers(remaining) + break + end + end + exit_code + rescue Errno::ECHILD + 0 + end + + def terminate_workers(pids) + return if pids.empty? + + pids.each { |pid| send_signal(pid, "TERM") } + sleep(0.2) + pids.each do |pid| + next unless process_alive?(pid) + + send_signal(pid, "KILL") + end + pids.each do |pid| + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def process_alive?(pid) + Process.kill(0, pid) + true + rescue Errno::ESRCH + false + end + + def setup_signal_handlers + SIGNALS.each_with_object({}) do |sig, acc| + next unless Signal.list.key?(sig) + + previous = Signal.trap(sig) do + @signal_received = sig + forward_signal(sig) + end + acc[sig] = previous + end + end + + def forward_signal(sig) + (@current_pids || []).each { |pid| send_signal(pid, sig) } + end + + def restore_signal_handlers(state) + return unless state + + state.each do |sig, previous| + Signal.trap(sig, previous) + end + end + + def send_signal(pid, sig) + Process.kill(sig, pid) + rescue Errno::ESRCH + nil + end + + def cleanup_workers(pids) + pids.each do |pid| + next unless process_alive?(pid) + + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def signal_exit_status + return 0 unless @signal_received + + 128 + Signal.list.fetch(@signal_received, 0) + end + + def exit_status_from(status) + if status.exited? + status.exitstatus + elsif status.signaled? 
+ 128 + status.termsig + else + 1 + end + end + + def determine_local_world_size(value) + spec = value.to_s.strip.downcase + case spec + when "", "1" + 1 + when /\A\d+\z/ + amount = spec.to_i + raise Error, "nproc-per-node must be >= 1" if amount < 1 + + amount + when "gpu" + gpu_count = cuda_device_count + raise Error, "CUDA is not available for --nproc-per-node=gpu" if gpu_count.zero? + + gpu_count + when "auto" + gpu_count = cuda_device_count + return gpu_count if gpu_count.positive? + + cpu_count + when "cpu" + cpu_count + else + raise Error, "Unsupported --nproc-per-node value: #{value}" + end + end + + def cuda_device_count + return 0 unless defined?(Torch::CUDA) + return 0 unless Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + return 0 unless Torch::CUDA.respond_to?(:device_count) + + Torch::CUDA.device_count + rescue StandardError + 0 + end + + def cpu_count + Etc.respond_to?(:nprocessors) ? (Etc.nprocessors || 1) : 1 + rescue StandardError + 1 + end + + def parse_nnodes(value) + parts = value.split(":") + nums = parts.map do |part| + Integer(part, exception: false) + end + raise Error, "Invalid --nnodes value: #{value.inspect}" if nums.any?(&:nil?) + + if nums.length == 1 + [nums.first, nums.first] + elsif nums.length == 2 + [nums.first, nums.last] + else + raise Error, "Invalid --nnodes value: #{value.inspect}" + end + end + + def ensure_fixed_nnodes(min_nodes, max_nodes) + raise Error, "--nnodes minimum must be >= 1" if min_nodes < 1 + raise Error, "--nnodes maximum must be >= minimum" if max_nodes < min_nodes + raise Error, "Elastic nnodes ranges are not supported yet (got #{min_nodes}:#{max_nodes})" if min_nodes != max_nodes + + min_nodes + end + + def world_size + @world_size ||= @num_nodes * @local_world_size + end + + def validate_node_rank! + raise Error, "--node-rank must be >= 0" if @node_rank.negative? 
+ raise Error, "--node-rank (#{@node_rank}) must be less than --nnodes (#{@num_nodes})" if @node_rank >= @num_nodes + end + + def setup_rendezvous! + @rdzv_backend = normalize_backend(@options[:rdzv_backend]) + @rdzv_conf = @options[:rdzv_conf] || {} + if @options[:standalone] + configure_standalone_rendezvous + else + configure_static_rendezvous + end + end + + def normalize_backend(value) + backend = value.to_s.downcase + raise Error, "Unsupported rendezvous backend: #{value.inspect}" unless %w[static c10d].include?(backend) + + backend + end + + def configure_standalone_rendezvous + @standalone = true + @rdzv_backend = "c10d" + @rdzv_id = SecureRandom.uuid + @master_addr = "127.0.0.1" + @master_port = find_free_port(@master_addr) + log(<<~MSG) + + ************************************** + Rendezvous info: + --rdzv-backend=#{@rdzv_backend} + --rdzv-endpoint=#{@master_addr}:#{@master_port} + --rdzv-id=#{@rdzv_id} + ************************************** + + MSG + end + + def configure_static_rendezvous + @standalone = false + endpoint_host, endpoint_port = parse_endpoint(@options[:rdzv_endpoint]) + @master_addr = endpoint_host || @options[:master_addr] + @master_port = endpoint_port || @options[:master_port] + @rdzv_id = @options[:rdzv_id] + raise Error, "MASTER_ADDR must be provided" if @master_addr.to_s.empty? + raise Error, "MASTER_PORT must be > 0" unless @master_port.to_i.positive? + end + + def parse_endpoint(value) + return [nil, nil] if value.nil? || value.strip.empty? + + host, port_str = value.split(":", 2) + port = port_str ? Integer(port_str, exception: false) : nil + raise Error, "Invalid rendezvous endpoint: #{value.inspect}" if host.to_s.empty? || (port_str && port.nil?) 
+ + [host, port] + end + + def find_free_port(host) + server = TCPServer.new(host, 0) + server.addr[1] + ensure + server&.close + end + + def log(message) + @out.puts(message) + end + end + end +end diff --git a/test/distributed_test.rb b/test/distributed_test.rb new file mode 100644 index 00000000..487f3af0 --- /dev/null +++ b/test/distributed_test.rb @@ -0,0 +1,77 @@ +require_relative "test_helper" +require "socket" + +class DistributedTest < Minitest::Test + def setup + super + skip "Distributed backend not available" unless Torch::Distributed.available? + end + + def test_all_reduce + results = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + tensor = Torch.tensor([rank + 1.0]) + Torch::Distributed.all_reduce(tensor) + Torch::Distributed.destroy_process_group + tensor.to_a + end + + assert_equal [[3.0], [3.0]], results + end + + def test_barrier + wait_times = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + sleep 0.3 if rank.zero? + before = Process.clock_gettime(Process::CLOCK_MONOTONIC) + Torch::Distributed.barrier + after = Process.clock_gettime(Process::CLOCK_MONOTONIC) + Torch::Distributed.destroy_process_group + after - before + end + + assert_operator wait_times.first, :<, 0.1 + assert_operator wait_times.last, :>=, 0.25 + end + + def test_broadcast + tensors = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
+ Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + tensor = Torch.tensor([rank + 1.0]) + Torch::Distributed.broadcast(tensor, src: 0) + Torch::Distributed.destroy_process_group + tensor.to_a + end + + assert_equal [[1.0], [1.0]], tensors + end + + def test_ddp_gradient_sync + grads = Torch::Distributed.fork_world(2) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) + Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + + model = Torch::NN::Linear.new(1, 1, bias: false) + ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) + input = Torch.tensor([[rank + 1.0]]) + output = ddp.call(input) + loss = output.sum + loss.backward + + grad = model.parameters.first.grad.item + Torch::Distributed.destroy_process_group + grad + end + + grads.each do |grad| + assert_in_delta 1.5, grad, 1e-6 + end + end + +end diff --git a/test/save_test.rb b/test/save_test.rb index a7438e03..640fdf25 100644 --- a/test/save_test.rb +++ b/test/save_test.rb @@ -55,6 +55,61 @@ def test_load_missing assert_equal "No such file or directory @ rb_sysopen - missing.bin", error.message end + def test_load_with_map_location_string + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + loaded = Torch.load(tmpfile.path, map_location: "cpu") + assert_equal tensor.to_a, loaded.to_a + end + + def test_load_with_map_location_callable + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + seen = [] + loaded = Torch.load(tmpfile.path, map_location: lambda { |value, loc| + seen << loc + value + }) + assert_equal tensor.to_a, loaded.to_a + assert_equal ["cpu"], seen + end + + def test_load_with_weights_only + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + loaded = Torch.load(tmpfile.path, weights_only: true) + assert_equal tensor.to_a, loaded.to_a + end + + 
def test_load_map_location_cuda_to_cpu + skip "Requires CUDA" unless Torch::CUDA.available? + + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]).cuda + Torch.save(tensor, tmpfile.path) + + loaded = Torch.load(tmpfile.path, map_location: "cpu") + assert_equal "cpu", loaded.device.type + assert_equal tensor.cpu.to_a, loaded.to_a + end + + def test_load_map_location_cpu_to_cuda + skip "Requires CUDA" unless Torch::CUDA.available? + + tmpfile = Tempfile.new + tensor = Torch.tensor([1, 2, 3]) + Torch.save(tensor, tmpfile.path) + + device = "cuda:0" + loaded = Torch.load(tmpfile.path, map_location: device) + assert_equal "cuda", loaded.device.type + assert_equal 0, loaded.device.index + assert_equal tensor.to_a, loaded.cpu.to_a + end + private def assert_save(obj) diff --git a/test/support/scripts/show_ranks.rb b/test/support/scripts/show_ranks.rb new file mode 100644 index 00000000..6654dfcb --- /dev/null +++ b/test/support/scripts/show_ranks.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +$stdout.sync = true +rank = ENV.fetch("RANK", "unknown") +local_rank = ENV.fetch("LOCAL_RANK", "unknown") +world_size = ENV.fetch("WORLD_SIZE", "unknown") +puts "RANK=#{rank} LOCAL_RANK=#{local_rank} WORLD_SIZE=#{world_size}" diff --git a/test/torchrun_test.rb b/test/torchrun_test.rb new file mode 100644 index 00000000..a3cf7a38 --- /dev/null +++ b/test/torchrun_test.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require_relative "test_helper" + +require "open3" +require "rbconfig" + +class TorchRunTest < Minitest::Test + def test_standalone_launches_multiple_workers + script = File.expand_path("support/scripts/show_ranks.rb", __dir__) + torchrun = File.expand_path("../bin/torchrun", __dir__) + stdout, stderr, status = Open3.capture3( + {"TORCHRUN_TEST" => "1"}, + RbConfig.ruby, + torchrun, + "--standalone", + "--nproc-per-node=2", + script + ) + + assert status.success?, "torchrun failed: #{stderr}" + + lines = stdout.lines.map(&:strip).select { |line| 
line.start_with?("RANK=") } + assert_equal 2, lines.size, "expected two worker outputs, got: #{lines.inspect}" + ranks = lines.map do |line| + match = line.match(/RANK=(\d+)\s+LOCAL_RANK=(\d+)\s+WORLD_SIZE=(\d+)/) + raise "unexpected output: #{line}" unless match + + [match[1].to_i, match[2].to_i, match[3].to_i] + end + assert_equal [[0, 0, 2], [1, 1, 2]], ranks.sort + end +end diff --git a/torch-rb.gemspec b/torch-rb.gemspec index 0adcc03b..40c89325 100644 --- a/torch-rb.gemspec +++ b/torch-rb.gemspec @@ -10,7 +10,9 @@ Gem::Specification.new do |spec| spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" - spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*"] + spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*", "bin/*"] + spec.executables = Dir["bin/*"].map { |file| File.basename(file) } + spec.bindir = "bin" spec.require_path = "lib" spec.extensions = ["ext/torch/extconf.rb"] From 20e7845f0ed66eb2212327a38d4f02f9cefbfddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Sat, 15 Nov 2025 10:39:32 +0300 Subject: [PATCH 19/28] Updated distributed example --- examples/mnist/distributed.rb | 79 +++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb index 531b0f87..91b6d52c 100644 --- a/examples/mnist/distributed.rb +++ b/examples/mnist/distributed.rb @@ -5,12 +5,14 @@ require "optparse" require "torch" require "torchvision" -require "socket" +require "tmpdir" unless Torch::Distributed.available? abort "torch.distributed was not built in this binary" end +DEFAULT_CHECKPOINT_PATH = File.join(Dir.tmpdir, "mnist_ddp_checkpoint.pt") + class MyNet < Torch::NN::Module def initialize super() @@ -43,7 +45,9 @@ def parse_options backend: "gloo", gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, log_interval: 20, - data_dir: File.join(__dir__, "data") + data_dir: File.join(__dir__, "data"), + checkpoint_path: DEFAULT_CHECKPOINT_PATH, + resume: false } OptionParser.new do |opts| @@ -56,28 +60,13 @@ def parse_options opts.on("--gpus N", Integer, "Number of GPUs/processes to use") { |v| defaults[:gpus] = v } opts.on("--log-interval N", Integer, "Batches between log statements") { |v| defaults[:log_interval] = v } opts.on("--data-dir PATH", String, "Directory for cached MNIST data") { |v| defaults[:data_dir] = v } + opts.on("--checkpoint PATH", String, "Checkpoint file to save to (default: #{defaults[:checkpoint_path]})") { |v| defaults[:checkpoint_path] = v } + opts.on("--resume", "Load checkpoint weights before training if the file exists") { defaults[:resume] = true } end.parse!(ARGV) defaults end -def free_port - server = TCPServer.new("127.0.0.1", 0) - port = server.addr[1] - server.close - port -end - -def spawn_workers(world_size) - port = free_port - - world_size.times.map do |rank| - fork do - yield(rank, world_size, port) - end - end.each { Process.wait2(_1) } -end - def load_datasets(rank, data_dir) transforms = TorchVision::Transforms::Compose.new([ TorchVision::Transforms::ToTensor.new, @@ -102,6 +91,39 @@ def subset_for_rank(dataset, rank, world_size) Torch::Utils::Data::Subset.new(dataset, indices) end +def checkpoint_map_location(device, rank) + accelerator_device = Torch::Accelerator.current_accelerator + return nil unless accelerator_device + + accelerator_type = accelerator_device.type + target_index = device.index + if target_index.nil? && Torch::Accelerator.respond_to?(:device_count) + count = Torch::Accelerator.device_count + target_index = count.positive? ? 
rank % count : 0 + end + { "#{accelerator_type}:0" => "#{accelerator_type}:#{target_index}" } +end + +def load_checkpoint_if_present(ddp, device, rank, path) + return false unless path && File.exist?(path) + + Torch::Distributed.barrier + kwargs = { weights_only: true } + map_location = checkpoint_map_location(device, rank) + kwargs[:map_location] = map_location if map_location + state_dict = Torch.load(path, **kwargs) + ddp.module.load_state_dict(state_dict) + true +end + +def save_checkpoint(ddp, path, rank) + return unless path + + Torch.save(ddp.module.state_dict, path) if rank.zero? + Torch::Distributed.barrier + puts "Saved checkpoint to #{path}" if rank.zero? +end + def train_epoch(model, device, loader, optimizer, epoch, rank, log_interval) model.train loader.each_with_index do |(data, target), batch_idx| @@ -163,6 +185,18 @@ def run_worker(rank, world_size, port, options) train_subset = subset_for_rank(train_dataset, rank, world_size) train_loader = Torch::Utils::Data::DataLoader.new(train_subset, batch_size: options[:batch_size], shuffle: true) test_loader = Torch::Utils::Data::DataLoader.new(test_dataset, batch_size: options[:batch_size], shuffle: false) if rank.zero? + checkpoint_path = options[:checkpoint_path] + + if options[:resume] + loaded = load_checkpoint_if_present(ddp, device, rank, checkpoint_path) + if rank.zero? + if loaded + puts "Loaded checkpoint weights from #{checkpoint_path}" + else + puts "No checkpoint found at #{checkpoint_path}, starting from random initialization" + end + end + end options[:epochs].times do |epoch_idx| epoch = epoch_idx + 1 @@ -170,6 +204,7 @@ def run_worker(rank, world_size, port, options) if rank.zero? 
evaluate(ddp.module, device, test_loader) end + save_checkpoint(ddp, checkpoint_path, rank) if checkpoint_path end Torch::Distributed.destroy_process_group @@ -190,9 +225,9 @@ def run_worker(rank, world_size, port, options) Torch.manual_seed(1) if world_size == 1 - run_worker(0, 1, free_port, options) + run_worker(0, 1, Torch::Distributed.free_port, options) else - spawn_workers(world_size) do |rank, total, port| - run_worker(rank, total, port, options) + Torch::Distributed.fork_world(world_size) do |rank, port| + run_worker(rank, world_size, port, options) end end From 52282cb32bf0b91b15aed2e941b2d0a809bb8242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 17 Nov 2025 12:25:09 +0300 Subject: [PATCH 20/28] inter-device map_location fixed --- ext/torch/torch.cpp | 30 ++++++++++++++++++++++++++++++ lib/torch.rb | 33 ++++++++++++++++++++++++++++++++- lib/torch/device.rb | 16 ++++++++++++++++ lib/torch/tensor.rb | 3 +-- 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/ext/torch/torch.cpp b/ext/torch/torch.cpp index 20e4c4d4..ded002af 100644 --- a/ext/torch/torch.cpp +++ b/ext/torch/torch.cpp @@ -1,8 +1,14 @@ #include +#include +#include #include #include #include +#include +#include + +#include #include #include @@ -76,6 +82,30 @@ void init_torch(Rice::Module& m) { input.close(); return torch::pickle_load(bytes); }) + .define_singleton_function( + "_load_with_device", + [](const std::string &filename, const std::string &device_str) { + std::ifstream input(filename, std::ios::binary); + std::vector bytes( + (std::istreambuf_iterator(input)), + (std::istreambuf_iterator())); + input.close(); + + auto device = c10::Device(device_str); + auto reader = std::make_shared( + bytes.data(), + static_cast(bytes.size())); + caffe2::serialize::PyTorchStreamReader stream_reader(reader); + + return torch::jit::readArchiveAndTensors( + "data", + /*pickle_prefix=*/"", + 
/*tensor_prefix=*/"", + /*type_resolver=*/std::nullopt, + /*obj_loader=*/std::nullopt, + /*device=*/device, + stream_reader); + }) .define_singleton_function( "_from_blob", [](Rice::String s, const std::vector &size, const torch::TensorOptions &options) { diff --git a/lib/torch.rb b/lib/torch.rb index ce213e1d..dd652872 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -406,7 +406,19 @@ def load(filename, map_location: nil, weights_only: false) # keep backwards compatibility File.open(filename, "rb") { |f| f.read(1) } - result = to_ruby(_load(filename)) + load_device = map_location_device(map_location) if map_location + result = + if load_device + device_str = + if load_device.respond_to?(:_str) + load_device._str + else + load_device.to_s + end + to_ruby(_load_with_device(filename, device_str)) + else + to_ruby(_load(filename)) + end ensure_weights_only_contents!(result) if weights_only result = apply_map_location(result, map_location) if map_location result @@ -570,6 +582,25 @@ def ensure_weights_only_contents!(obj) end end + def map_location_device(map_location) + case map_location + when Device, String, Symbol + normalize_map_location_device(map_location) + when Hash + devices = + map_location.values.map do |value| + normalize_map_location_device(value) + rescue StandardError + nil + end.compact + return nil if devices.empty? + devices.uniq! + devices.one? ? 
devices.first : nil + else + nil + end + end + def apply_map_location(obj, map_location) case obj when Tensor diff --git a/lib/torch/device.rb b/lib/torch/device.rb index 45a822a8..f80868ff 100644 --- a/lib/torch/device.rb +++ b/lib/torch/device.rb @@ -22,4 +22,20 @@ def hash [type, index].hash end end + + # String-like wrapper that also exposes device metadata + class DeviceString < String + def initialize(device) + @device = device + super(device._str) + end + + def type + @device.type + end + + def index + @device.index + end + end end diff --git a/lib/torch/tensor.rb b/lib/torch/tensor.rb index ed8ab71e..318f14e0 100644 --- a/lib/torch/tensor.rb +++ b/lib/torch/tensor.rb @@ -211,9 +211,8 @@ def coerce(other) end end - # TODO return Device instead of String in 0.19.0 def device - _device._str + DeviceString.new(_device) end end end From cac2534a771f9663ddea9ad06fcc8a97d1e27f11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Mon, 17 Nov 2025 13:02:57 +0300 Subject: [PATCH 21/28] autodetecting libtorch distributed support --- ext/torch/extconf.rb | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index cf3c6706..0a8250b0 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -70,6 +70,45 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end +supports_c10d = try_link(<<~CPP, "-DUSE_C10D") + #include + #include + + int main() { + ::c10d::FileStore store("unused", 1); + return 0; + } +CPP + +supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") + #include + #include + #include + + int main() { + auto store = c10::make_intrusive<::c10d::FileStore>("unused", 1); + auto opts = ::c10d::ProcessGroupGloo::Options::create(); + opts->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); + ::c10d::ProcessGroupGloo pg(store, 0, 1, opts); + return 
static_cast(pg.getRank()); + } +CPP + +supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") + #include + #include + + int main() { + auto opts = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + opts->is_high_priority_stream = false; + return 0; + } +CPP + +$defs << "-DUSE_C10D" if supports_c10d +$defs << "-DUSE_C10D_GLOO" if supports_c10d_gloo +$defs << "-DUSE_C10D_NCCL" if supports_c10d_nccl + # generate C++ functions puts "Generating C++ functions..." require_relative "../../codegen/generate_functions" From d8dc2953e4cbf995d1f2811ad56eebbc4649f113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 13:21:03 +0300 Subject: [PATCH 22/28] DDP fixes and improvements --- README.md | 11 + examples/benchmark/training.rb | 207 +++++++++++ examples/mnist/distributed.rb | 11 +- ext/torch/cuda.cpp | 9 +- ext/torch/distributed.cpp | 98 ++++- ext/torch/extconf.rb | 22 +- ext/torch/tensor.cpp | 14 +- lib/torch/distributed.rb | 338 +++++++++++++++--- .../nn/parallel/distributed_data_parallel.rb | 20 +- lib/torch/tensor.rb | 3 +- lib/torch/torchrun.rb | 26 +- test/distributed_test.rb | 193 ++++++++-- test/test_helper.rb | 26 ++ 13 files changed, 880 insertions(+), 98 deletions(-) create mode 100644 examples/benchmark/training.rb diff --git a/README.md b/README.md index d477c8ca..78934193 100644 --- a/README.md +++ b/README.md @@ -56,9 +56,18 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) - [Distributed MNIST training](examples/mnist/distributed.rb) +- [Training benchmarks (variable batch size / GPU count)](examples/benchmark/training.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) +Run the benchmark with: + +```sh +bundle 
exec ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 256 --gpus 1 --steps 50 +``` + +Set `--gpus` to 2+ to enable distributed training; `--steps` measures only timed steps and `--warmup` sets warmup iterations. + ## Distributed Training Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. @@ -84,6 +93,8 @@ bundle exec torchrun \ On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. +For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: diff --git a/examples/benchmark/training.rb b/examples/benchmark/training.rb new file mode 100644 index 00000000..090018f6 --- /dev/null +++ b/examples/benchmark/training.rb @@ -0,0 +1,207 @@ +# Benchmark training throughput for common architectures/datasets. +# Usage examples: +# ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 128 --gpus 1 +# ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 128 --gpus 2 --steps 50 + +require "bundler/setup" +require "optparse" +require "torch" +require "torchvision" + +DEFAULT_BACKEND = if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ "nccl" +else + Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" +end + +class MnistCnn < Torch::NN::Module + def initialize + super() + @conv1 = Torch::NN::Conv2d.new(1, 32, 3, stride: 1) + @conv2 = Torch::NN::Conv2d.new(32, 64, 3, stride: 1) + @dropout1 = Torch::NN::Dropout2d.new(p: 0.25) + @dropout2 = Torch::NN::Dropout2d.new(p: 0.5) + @fc1 = Torch::NN::Linear.new(9216, 128) + @fc2 = Torch::NN::Linear.new(128, 10) + end + + def forward(x) + x = Torch::NN::F.relu(@conv1.call(x)) + x = Torch::NN::F.relu(@conv2.call(x)) + x = Torch::NN::F.max_pool2d(x, 2) + x = @dropout1.call(x) + x = Torch.flatten(x, start_dim: 1) + x = Torch::NN::F.relu(@fc1.call(x)) + x = @dropout2.call(x) + Torch::NN::F.log_softmax(@fc2.call(x), 1) + end +end + +ARCH_CONFIGS = { + "mnist_cnn" => { + model: -> { MnistCnn.new }, + dataset: :mnist + } +}.freeze + +def parse_options + defaults = { + arch: "mnist_cnn", + batch_size: 128, + steps: 100, + warmup: 10, + backend: DEFAULT_BACKEND, + gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, + data_dir: File.join(__dir__, "data"), + lr: 0.01 + } + + OptionParser.new do |opts| + opts.banner = "Usage: ruby examples/benchmark/training.rb [options]" + opts.on("--arch NAME", "Architecture to benchmark (#{ARCH_CONFIGS.keys.join(', ')}, default: #{defaults[:arch]})") { |v| defaults[:arch] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--steps N", Integer, "Number of timed training steps (default: #{defaults[:steps]})") { |v| defaults[:steps] = v } + opts.on("--warmup N", Integer, "Number of warmup steps not included in timing (default: #{defaults[:warmup]})") { |v| defaults[:warmup] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--gpus N", Integer, "Number of GPUs/processes to use (1 for non-distributed)") { |v| defaults[:gpus] = v } + opts.on("--data-dir PATH", String, "Directory for cached datasets (default: #{defaults[:data_dir]})") { |v| defaults[:data_dir] = v } + opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } + end.parse!(ARGV) + + defaults +end + +def dataset_for(name, data_dir, distributed:, rank:, world_size:) + case name + when :mnist + transforms = TorchVision::Transforms::Compose.new([ + TorchVision::Transforms::ToTensor.new, + TorchVision::Transforms::Normalize.new([0.1307], [0.3081]) + ]) + + if distributed + if rank.zero? 
+ train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + Torch::Distributed.barrier + else + Torch::Distributed.barrier + train = TorchVision::Datasets::MNIST.new(data_dir, train: true, download: false, transform: transforms) + end + indices = rank.step(train.size - 1, world_size).to_a + Torch::Utils::Data::Subset.new(train, indices) + else + TorchVision::Datasets::MNIST.new(data_dir, train: true, download: true, transform: transforms) + end + else + raise ArgumentError, "Unknown dataset: #{name}" + end +end + +def sync_cuda_if_needed(device) + return unless device && device.type == "cuda" + return unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:synchronize) + + Torch::CUDA.synchronize +end + +def benchmark_worker(rank, world_size, port, options) + arch = options.fetch(:arch) + config = ARCH_CONFIGS[arch] + raise ArgumentError, "Unsupported architecture #{arch.inspect}" unless config + + distributed = world_size > 1 + if distributed + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + accelerator = Torch::Accelerator.current_accelerator + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + end + + device = if Torch::CUDA.available? && options[:gpus] > 0 + Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + else + Torch.device("cpu") + end + + model = config[:model].call.to(device) + if distributed + ddp_devices = device.type == "cuda" ? 
[device.index] : nil + model = Torch::NN::Parallel::DistributedDataParallel.new(model, device_ids: ddp_devices) + end + optimizer = Torch::Optim::SGD.new(model.parameters, lr: options[:lr]) + + loader = Torch::Utils::Data::DataLoader.new( + dataset_for(config[:dataset], options[:data_dir], distributed: distributed, rank: rank, world_size: world_size), + batch_size: options[:batch_size], + shuffle: true + ) + + warmup_steps = options[:warmup] + timed_steps = options[:steps] + total_steps = warmup_steps + timed_steps + + step_idx = 0 + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + step_idx += 1 + break if step_idx >= total_steps + end + + sync_cuda_if_needed(device) + Torch::Distributed.barrier if distributed + + timed = 0 + step_idx = 0 + start = Process.clock_gettime(Process::CLOCK_MONOTONIC) + loader.each do |data, target| + data = data.to(device) + target = target.to(device) + + optimizer.zero_grad + loss = Torch::NN::F.nll_loss(model.call(data), target) + loss.backward + optimizer.step + + step_idx += 1 + break if step_idx >= timed_steps + end + + sync_cuda_if_needed(device) + Torch::Distributed.barrier if distributed + elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start + timed = step_idx + + if rank.zero? 
+ images = timed * options[:batch_size] * world_size + puts "Architecture: #{arch}" + puts "Dataset: #{config[:dataset]}" + puts "GPUs: #{world_size}" + puts "Batch size per process: #{options[:batch_size]}" + puts "Timed steps: #{timed}" + puts "Total images: #{images}" + puts format("Elapsed: %.3fs | Throughput: %.1f images/s", elapsed, images / elapsed) + end + + Torch::Distributed.destroy_process_group if distributed +end + +options = parse_options +world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if world_size < 1 +Torch.manual_seed(1) + +if world_size > 1 + raise "torch.distributed is not available" unless Torch::Distributed.available? + Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| + benchmark_worker(rank, world_size, port, options) + end +else + benchmark_worker(0, 1, Torch::Distributed.free_port, options) +end diff --git a/examples/mnist/distributed.rb b/examples/mnist/distributed.rb index 91b6d52c..d300ead0 100644 --- a/examples/mnist/distributed.rb +++ b/examples/mnist/distributed.rb @@ -12,6 +12,11 @@ end DEFAULT_CHECKPOINT_PATH = File.join(Dir.tmpdir, "mnist_ddp_checkpoint.pt") +DEFAULT_BACKEND = if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + "nccl" +else + Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" +end class MyNet < Torch::NN::Module def initialize @@ -42,7 +47,7 @@ def parse_options batch_size: 64, lr: 1.0, gamma: 0.7, - backend: "gloo", + backend: DEFAULT_BACKEND, gpus: Torch::CUDA.available? ? [Torch::CUDA.device_count, 1].max : 1, log_interval: 20, data_dir: File.join(__dir__, "data"), @@ -167,7 +172,7 @@ def evaluate(model, device, loader) def run_worker(rank, world_size, port, options) store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) 
accelerator = Torch::Accelerator.current_accelerator - backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) + backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) device = if Torch::CUDA.available? && options[:gpus] > 0 @@ -227,7 +232,7 @@ def run_worker(rank, world_size, port, options) if world_size == 1 run_worker(0, 1, Torch::Distributed.free_port, options) else - Torch::Distributed.fork_world(world_size) do |rank, port| + Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| run_worker(rank, world_size, port, options) end end diff --git a/ext/torch/cuda.cpp b/ext/torch/cuda.cpp index 23f38d80..69b2529f 100644 --- a/ext/torch/cuda.cpp +++ b/ext/torch/cuda.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -9,5 +10,11 @@ void init_cuda(Rice::Module& m) { .define_singleton_function("available?", &torch::cuda::is_available) .define_singleton_function("device_count", &torch::cuda::device_count) .define_singleton_function("manual_seed", &torch::cuda::manual_seed) - .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all); + .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all) + .define_singleton_function( + "set_device", + [](int device_id) { + c10::cuda::set_device(device_id); + return Rice::Nil; + }); } diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp index de5f7c9a..b3a22bc3 100644 --- a/ext/torch/distributed.cpp +++ b/ext/torch/distributed.cpp @@ -1,11 +1,16 @@ #include #include #include +#include #include #include #include #include +#if defined(USE_C10D) && defined(USE_C10D_NCCL) +#include +#include +#endif #include #include @@ -13,6 +18,7 @@ #include "utils.h" #ifdef USE_C10D +#include #include #include #include @@ -34,7 +40,7 @@ namespace { #ifdef USE_C10D using StorePtr = 
c10::intrusive_ptr<::c10d::Store>; -using ProcessGroupPtr = c10::intrusive_ptr<::c10d::ProcessGroup>; +using ProcessGroupPtr = c10::intrusive_ptr<::c10d::Backend>; struct StoreWrapper { StoreWrapper() = default; @@ -51,6 +57,24 @@ struct ProcessGroupWrapper { }; ProcessGroupPtr default_process_group; +std::once_flag default_pg_cleanup_once; + +void shutdown_default_process_group() { + if (default_process_group) { + try { + default_process_group->shutdown(); + } catch (...) { + // best effort; ensure reset still happens + } + default_process_group.reset(); + } +} + +void register_default_pg_cleanup() { + std::call_once(default_pg_cleanup_once, []() { + std::atexit([]() { shutdown_default_process_group(); }); + }); +} ProcessGroupPtr resolve_process_group(Rice::Object pg_obj) { if (pg_obj.is_nil()) { @@ -80,6 +104,7 @@ int reduce_op_from_int(int code) { void init_distributed(Rice::Module& m) { auto rb_mDistributed = Rice::define_module_under(m, "Distributed"); #ifdef USE_C10D + register_default_pg_cleanup(); rb_mDistributed.define_singleton_function("available?", []() { return true; }); auto rb_cStore = Rice::define_class_under(rb_mDistributed, "Store"); @@ -116,7 +141,7 @@ void init_distributed(Rice::Module& m) { int world_size, bool is_master, int64_t timeout_millis, - bool wait_for_workers) { + bool wait_for_workers) -> Rice::Object { ::c10d::TCPStoreOptions opts; opts.port = static_cast(port); opts.isServer = is_master; @@ -124,22 +149,23 @@ void init_distributed(Rice::Module& m) { opts.waitWorkers = wait_for_workers; opts.timeout = std::chrono::milliseconds(timeout_millis); auto store = c10::make_intrusive<::c10d::TCPStore>(host, opts); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + // Pass ownership first, then the Ruby class so Rice doesn't treat the class as the owner flag + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); rb_mDistributed.define_singleton_function( "_create_file_store", - [rb_cStore](const 
std::string& path, int world_size) { + [rb_cStore](const std::string& path, int world_size) -> Rice::Object { auto store = c10::make_intrusive<::c10d::FileStore>(path, world_size); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); #if !defined(_WIN32) rb_mDistributed.define_singleton_function( "_create_hash_store", - [rb_cStore]() { + [rb_cStore]() -> Rice::Object { auto store = c10::make_intrusive<::c10d::HashStore>(); - return Rice::Data_Object(new StoreWrapper(store), rb_cStore, true); + return Rice::Data_Object(new StoreWrapper(store), true, rb_cStore); }); #endif @@ -149,7 +175,8 @@ void init_distributed(Rice::Module& m) { StoreWrapper& store_wrapper, int rank, int world_size, - int64_t timeout_millis) { + int64_t timeout_millis, + int device_id) -> Rice::Object { StorePtr store = store_wrapper.store_; if (!store) { rb_raise(rb_eArgError, "Store is required for init_process_group"); @@ -179,14 +206,32 @@ void init_distributed(Rice::Module& m) { rb_raise(rb_eArgError, "Unsupported backend: %s", backend.c_str()); } + if (device_id >= 0 && backend_lower == "nccl") { +#if defined(USE_C10D_NCCL) + if (!torch::cuda::is_available()) { + rb_raise(rb_eRuntimeError, "CUDA is not available for NCCL backend"); + } + auto device_count = torch::cuda::device_count(); + if (device_id >= static_cast(device_count)) { + rb_raise( + rb_eArgError, + "Invalid device_id %d for NCCL backend (available devices: %d)", + device_id, + static_cast(device_count)); + } + c10::cuda::set_device(device_id); + pg->setBoundDeviceId(c10::Device(c10::kCUDA, device_id)); +#endif + } + default_process_group = pg; - return Rice::Data_Object(new ProcessGroupWrapper(pg), rb_cProcessGroup, true); + return Rice::Data_Object(new ProcessGroupWrapper(pg), true, rb_cProcessGroup); }); rb_mDistributed.define_singleton_function( "_destroy_process_group", []() { - default_process_group.reset(); + 
shutdown_default_process_group(); return Rice::Nil; }); @@ -198,11 +243,11 @@ void init_distributed(Rice::Module& m) { rb_mDistributed.define_singleton_function( "_default_process_group", - [rb_cProcessGroup]() { + [rb_cProcessGroup]() -> Rice::Object { if (!default_process_group) { return Rice::Nil; } - return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), rb_cProcessGroup, true); + return Rice::Data_Object(new ProcessGroupWrapper(default_process_group), true, rb_cProcessGroup); }); rb_mDistributed.define_singleton_function( @@ -253,6 +298,33 @@ void init_distributed(Rice::Module& m) { return tensor; }); + rb_mDistributed.define_singleton_function( + "_register_ddp_hook", + [](torch::Tensor& tensor, ProcessGroupWrapper& pg_wrapper, int world_size) -> unsigned { + if (!pg_wrapper.pg_) { + rb_raise(rb_eArgError, "Process group is required for DDP hook registration"); + } + if (world_size <= 0) { + rb_raise(rb_eArgError, "world_size must be positive"); + } + + auto pg = pg_wrapper.pg_; + // Register a native autograd hook that all-reduces gradients and scales + // them by the world size. This avoids calling back into Ruby from + // autograd worker threads. 
+ unsigned handle = tensor.register_hook([pg, world_size](const at::Tensor& grad) { + ::c10d::AllreduceOptions opts; + opts.reduceOp = ::c10d::ReduceOp::SUM; + std::vector tensors{grad}; + auto work = pg->allreduce(tensors, opts); + work->wait(); + grad.div_(static_cast(world_size)); + return grad; + }); + + return handle; + }); + auto rb_mReduceOp = Rice::define_module_under(rb_mDistributed, "ReduceOp"); rb_mReduceOp.const_set("SUM", INT2NUM(static_cast(::c10d::ReduceOp::SUM))); rb_mReduceOp.const_set("AVG", INT2NUM(static_cast(::c10d::ReduceOp::AVG))); @@ -264,7 +336,7 @@ void init_distributed(Rice::Module& m) { rb_mReduceOp.const_set("BXOR", INT2NUM(static_cast(::c10d::ReduceOp::BXOR))); rb_mReduceOp.const_set("PREMUL_SUM", INT2NUM(static_cast(::c10d::ReduceOp::PREMUL_SUM))); - rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::c10d::kProcessGroupDefaultTimeout.count() / 1000)); + rb_mDistributed.const_set("DEFAULT_TIMEOUT", INT2NUM(::kProcessGroupDefaultTimeout.count() / 1000)); #else rb_mDistributed.define_singleton_function("available?", []() { return false; }); #endif diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 0a8250b0..2cadd1c9 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -47,6 +47,7 @@ with_cuda = false if Dir["#{lib}/*torch_cuda*"].any? 
$LDFLAGS += " -L#{cuda_lib}" if Dir.exist?(cuda_lib) + $INCFLAGS += " -I#{cuda_inc}" if Dir.exist?(cuda_inc) $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib with_cuda = have_library("cuda") && have_library("cudnn") end @@ -70,6 +71,9 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end +CONFIG["CC"] = CONFIG["CXX"] +$CFLAGS = $CXXFLAGS + supports_c10d = try_link(<<~CPP, "-DUSE_C10D") #include #include @@ -105,9 +109,21 @@ } CPP -$defs << "-DUSE_C10D" if supports_c10d -$defs << "-DUSE_C10D_GLOO" if supports_c10d_gloo -$defs << "-DUSE_C10D_NCCL" if supports_c10d_nccl +if supports_c10d + $defs << " -DUSE_C10D" + puts "Building with distributed support" +else + puts "Building without distributed support" +end + +if supports_c10d_gloo + $defs << "-DUSE_C10D_GLOO" + puts "GLOO support detected" +end +if supports_c10d_nccl + $defs << "-DUSE_C10D_NCCL" + puts "NCCL support detected" +end # generate C++ functions puts "Generating C++ functions..." diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index c7e003d8..d5fb0bc3 100644 --- a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "tensor_functions.h" @@ -38,11 +39,22 @@ struct RubyTensorHook { rb_gc_register_address(&proc_); } + // The autograd engine can invoke hooks from threads not created by Ruby. + // Register the calling thread with Ruby before acquiring the GVL to avoid + // "rb_thread_call_with_gvl() is called by non-ruby thread" crashes. + static void ensure_ruby_thread_registered() { + // ruby_init_stack is idempotent and safe to call repeatedly; it ensures the + // current native thread is known to the VM before we try to grab the GVL. 
+ volatile VALUE stack_anchor = Qnil; + ruby_init_stack(&stack_anchor); + } + ~RubyTensorHook() { rb_gc_unregister_address(&proc_); } at::Tensor call(const at::Tensor& grad) { + ensure_ruby_thread_registered(); HookCallData data{proc_, grad}; rb_thread_call_with_gvl(&RubyTensorHook::invoke, &data); if (data.return_value_defined) { @@ -121,7 +133,7 @@ VALUE tensor_register_hook(int argc, VALUE* argv, VALUE self_) { return hook->call(grad); }); - return Rice::Data_Object(new HookHandle(self, handle, hook), rb_cHookHandle, true); + return Rice::Data_Object(new HookHandle(self, handle, hook), true, rb_cHookHandle); END_HANDLE_TH_ERRORS } diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb index 77428b52..7e1e739d 100644 --- a/lib/torch/distributed.rb +++ b/lib/torch/distributed.rb @@ -1,4 +1,5 @@ require "socket" +require "rbconfig" module Torch module Distributed @@ -9,6 +10,15 @@ module Distributed "mps" => "gloo" }.freeze + SPAWN_ENV_KEY = "TORCH_DISTRIBUTED_SPAWNED".freeze + SPAWN_RANK_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_RANK".freeze + SPAWN_WORLD_SIZE_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_WORLD_SIZE".freeze + SPAWN_PORT_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_PORT".freeze + SPAWN_PIPE_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_PIPE".freeze + SPAWN_SCRIPT_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_SCRIPT".freeze + SPAWN_TEST_ENV_KEY = "TORCH_DISTRIBUTED_SPAWN_TEST".freeze + SPAWN_ARGV = ARGV.dup.freeze + class << self def initialized? _initialized? @@ -38,8 +48,15 @@ def init_process_group(backend = nil, init_method: "env://", store: nil, rank: n raise ArgumentError, "rank is required" if rank.nil? raise ArgumentError, "world_size is required" if world_size.nil? + device_id ||= default_device_id_for_backend(backend, rank, world_size) + timeout_ms = (timeout * 1000).to_i - _init_process_group(backend, store, rank, world_size, timeout_ms) + bound_device_id = device_id.nil? ? 
-1 : Integer(device_id) + if backend == "nccl" && bound_device_id >= 0 && Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device) + Torch::CUDA.set_device(bound_device_id) + end + pg = _init_process_group(backend, store, rank, world_size, timeout_ms, bound_device_id) + warmup_process_group(pg, backend) end def destroy_process_group @@ -75,64 +92,101 @@ def broadcast(tensor, src:, group: nil) _broadcast(tensor, src, group) end + def register_ddp_hook(tensor, process_group, world_size) + ensure_process_group!(process_group) + _register_ddp_hook(tensor, process_group, Integer(world_size)) + rescue NoMethodError + # Fallback for environments built without the native helper; this may + # still call back into Ruby from autograd threads. + tensor.register_hook do |grad| + all_reduce(grad, group: process_group) + grad.div!(world_size.to_f) + end + end + def get_default_backend_for_device(device) backend = DEFAULT_DEVICE_BACKENDS[device_type_from(device)] raise ArgumentError, "Default backend not registered for device: #{device.inspect}" unless backend backend end - def fork_world(world_size, host: "127.0.0.1") + def fork_world(world_size, host: "127.0.0.1", start_method: :fork, &block) raise ArgumentError, "world_size must be positive" unless world_size.to_i.positive? - raise ArgumentError, "block required" unless block_given? + raise ArgumentError, "block required" unless block + start_method = normalize_start_method(start_method) + return run_spawn_worker(&block) if start_method == :spawn && spawn_worker? 
+ + fork_spawn_world(world_size, host: host, start_method: start_method, &block) + end + + def fork_spawn_world(world_size, host:, start_method:, &block) port = free_port(host: host) readers = [] pids = [] - world_size.times do |rank| - reader, writer = IO.pipe - pid = fork do - reader.close + pgid = nil + completed = false + + begin + world_size.times do |rank| + reader, writer = IO.pipe begin - writer.binmode - result = yield(rank, port) - Marshal.dump(result, writer) - exit! 0 - rescue => e - Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) - exit! 1 - ensure + case start_method + when :fork + pids << fork_worker(reader, writer, rank, port, world_size, &block) + when :spawn + pid, pgid = spawn_worker(reader, writer, rank, port, host: host, world_size: world_size, pgid: pgid) + pids << pid + else + raise ArgumentError, "Unsupported start_method: #{start_method.inspect}" + end + readers << reader writer.close unless writer.closed? + rescue Exception + reader.close unless reader.closed? + writer.close unless writer.closed? + raise end end - writer.close - readers << reader - pids << pid - end - outputs = readers.map do |reader| - data = Marshal.load(reader) - reader.close - data - end + read_failure = Object.new - statuses = pids.each_with_index.map do |pid, idx| - _pid, status = Process.wait2(pid) - [idx, pid, status] - end + outputs = readers.map do |reader| + begin + Marshal.load(reader) + rescue EOFError + read_failure + ensure + reader.close unless reader.closed? + end + end + + statuses = pids.each_with_index.map do |pid, idx| + _pid, status = Process.wait2(pid) + [idx, pid, status] + end - statuses.each do |idx, pid, status| - output = outputs[idx] - if !status.success? 
|| (output.is_a?(Hash) && output[:error]) - message = if output.is_a?(Hash) && output[:error] - "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" - else - "Child #{pid} exited with status #{status.exitstatus}" + statuses.each do |idx, pid, status| + output = outputs[idx] + if output.equal?(read_failure) + raise Torch::Error, "Child #{pid} closed pipe before sending result (status #{status.exitstatus})" + end + if !status.success? || (output.is_a?(Hash) && output[:error]) + message = if output.is_a?(Hash) && output[:error] + "Child #{pid} failed: #{output[:error]}\n#{Array(output[:backtrace]).join("\n")}" + else + "Child #{pid} exited with status #{status.exitstatus}" + end + raise Torch::Error, message end - raise Torch::Error, message end - end - outputs + completed = true + outputs + ensure + # Ensure child workers are cleaned up if an interrupt or error occurs. + terminate_processes(pids, pgid: pgid) unless completed + end end def free_port(host: "127.0.0.1") @@ -150,6 +204,44 @@ def ensure_process_group!(group) raise Torch::Error, "Default process group is not initialized" end + def default_device_id_for_backend(backend, rank, world_size) + return unless backend == "nccl" + + default_local_rank(rank, world_size) + end + + def warmup_process_group(pg, backend) + return pg unless backend == "nccl" + + # Only warm up when a native process group was returned. + # Test helpers may stub out `_init_process_group` and return arbitrary + # Ruby objects, which cannot be passed to the C++ bindings. + return pg unless pg.nil? || (defined?(Torch::Distributed::ProcessGroup) && pg.is_a?(Torch::Distributed::ProcessGroup)) + + # Prime NCCL communicators so the first user-visible collective is fast + _barrier(pg) + pg + rescue + _destroy_process_group + raise + end + + def default_local_rank(rank, world_size) + local_rank = env_integer("LOCAL_RANK") + return local_rank unless local_rank.nil? 
+ + local_world_size = env_integer("LOCAL_WORLD_SIZE") || world_size + return unless local_world_size && rank + + rank % local_world_size if local_world_size.positive? + end + + def env_integer(key) + Integer(ENV[key]) if ENV.key?(key) + rescue ArgumentError + nil + end + def default_backend_for(device_id) get_default_backend_for_device(device_id) end @@ -158,18 +250,174 @@ def device_type_from(device) case device when Torch::Device device.type + when NilClass + accelerator_type || "cpu" when String Torch.device(device).type when Integer - Torch.device("cuda:#{device}").type - when NilClass - Torch::Accelerator.current_accelerator&.type || "cpu" + return accelerator_type || "cpu" if device.negative? + if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:device_count) + max = Torch::CUDA.device_count + return accelerator_type || "cpu" if max <= 0 || device >= max + return Torch.device("cuda:#{device}").type + end + accelerator_type || "cpu" else + return device.type if device.respond_to?(:type) Torch.device(device).type end rescue => e raise ArgumentError, "Invalid device #{device.inspect}: #{e.message}" end + + def accelerator_type + acc = Torch::Accelerator.current_accelerator + acc.type if acc && acc.respond_to?(:type) + rescue + nil + end + + def normalize_start_method(start_method) + method = start_method&.to_sym + return method if [:fork, :spawn].include?(method) + + raise ArgumentError, "start_method must be :fork or :spawn (got #{start_method.inspect})" + end + + def spawn_worker? 
+ ENV[SPAWN_ENV_KEY] == "1" + end + + def run_spawn_worker(&block) + rank = Integer(ENV.fetch(SPAWN_RANK_ENV_KEY)) + port = Integer(ENV.fetch(SPAWN_PORT_ENV_KEY)) + pipe_fd = Integer(ENV.fetch(SPAWN_PIPE_ENV_KEY)) + + writer = IO.new(pipe_fd, "wb") + writer.binmode + writer.sync = true + + result = block.call(rank, port) + Marshal.dump(result, writer) + writer.flush + writer.close + Process.exit!(0) + rescue Exception => e + begin + if defined?(writer) && writer && !writer.closed? + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + writer.flush + writer.close + end + rescue StandardError + # best-effort error reporting back to parent + ensure + Process.exit!(1) + end + end + + def fork_worker(reader, writer, rank, port, world_size, &block) + fork do + reader.close + begin + ENV["LOCAL_RANK"] = rank.to_s + ENV["LOCAL_WORLD_SIZE"] = world_size.to_s + ENV["RANK"] = rank.to_s + ENV["WORLD_SIZE"] = world_size.to_s + writer.binmode + writer.sync = true + result = block.call(rank, port) + Marshal.dump(result, writer) + writer.flush + writer.close + Process.exit!(0) + rescue => e + Marshal.dump({error: "#{e.class}: #{e.message}", backtrace: e.backtrace}, writer) + writer.flush + writer.close + Process.exit!(1) + ensure + writer.close unless writer.closed? 
+ end + end + end + + def spawn_worker(reader, writer, rank, port, host:, world_size:, pgid: nil) + writer.binmode + writer.close_on_exec = false + + script = ENV[SPAWN_SCRIPT_ENV_KEY] || $0 + env = { + SPAWN_ENV_KEY => "1", + SPAWN_RANK_ENV_KEY => rank.to_s, + SPAWN_WORLD_SIZE_ENV_KEY => world_size.to_s, + SPAWN_PORT_ENV_KEY => port.to_s, + SPAWN_PIPE_ENV_KEY => writer.fileno.to_s, + "LOCAL_RANK" => rank.to_s, + "LOCAL_WORLD_SIZE" => world_size.to_s, + "MASTER_ADDR" => host, + "MASTER_PORT" => port.to_s, + "RANK" => rank.to_s, + "WORLD_SIZE" => world_size.to_s + } + env["RUBYLIB"] = [ENV["RUBYLIB"], $LOAD_PATH.join(File::PATH_SEPARATOR)].compact.reject(&:empty?).join(File::PATH_SEPARATOR) + + spawn_opts = {close_others: false} + spawn_opts[:pgroup] = pgid ? pgid : true + + pid = Process.spawn(env, RbConfig.ruby, script, *spawn_argv, spawn_opts) + pgid ||= pid + [pid, pgid] + rescue SystemCallError => e + raise Torch::Error, "failed to spawn worker #{rank}: #{e.message}" + end + + def spawn_argv + test_filter = ENV[SPAWN_TEST_ENV_KEY] + return SPAWN_ARGV unless test_filter + return SPAWN_ARGV if SPAWN_ARGV.include?("-n") + + # Restrict child to the specific test that triggered the spawn + SPAWN_ARGV + ["-n", test_filter] + end + + def terminate_processes(pids, pgid: nil) + return if pids.empty? 
&& !pgid + + send_process_group_signal(pgid, "TERM") + pids.each { |pid| safe_kill(pid, "TERM") } + sleep(0.2) + pids.each do |pid| + next unless process_alive?(pid) + + safe_kill(pid, "KILL") + end + pids.each do |pid| + begin + Process.wait(pid) + rescue Errno::ECHILD + end + end + end + + def send_process_group_signal(pgid, sig) + return unless pgid + + Process.kill(sig, -pgid) + rescue Errno::ESRCH + end + + def safe_kill(pid, sig) + Process.kill(sig, pid) + rescue Errno::ESRCH + end + + def process_alive?(pid) + Process.kill(0, pid) + true + rescue Errno::ESRCH + false + end end class TCPStore @@ -193,3 +441,11 @@ def self.new end end end + +at_exit do + begin + Torch::Distributed.destroy_process_group if Torch::Distributed.available? && Torch::Distributed.initialized? + rescue Exception + # best-effort cleanup to avoid leaked process groups + end +end diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb index 87178f3b..2a1782c8 100644 --- a/lib/torch/nn/parallel/distributed_data_parallel.rb +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -15,7 +15,7 @@ def initialize(mod, device_ids: nil, process_group: nil, broadcast_buffers: true @world_size = Torch::Distributed.get_world_size(@process_group) @rank = Torch::Distributed.get_rank(@process_group) - @device = Array(device_ids).compact.first + @device = normalize_device(Array(device_ids).compact.first) move_to_device(@device) if @device synchronize_parameters @@ -38,6 +38,19 @@ def train(mode = true) private + def normalize_device(device) + return nil unless device + return device if device.is_a?(Torch::Device) + + if device.is_a?(Integer) + if Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ return Torch.device("cuda:#{device}") + end + end + + Torch.device(device) + end + def move_to_device(device) return unless device @@ -89,10 +102,7 @@ def register_parameter_hooks @module.parameters.filter_map do |param| next unless param.requires_grad? - param.register_hook do |grad| - Torch::Distributed.all_reduce(grad, group: @process_group) - grad.div!(@world_size.to_f) - end + Torch::Distributed.register_ddp_hook(param, @process_group, @world_size) end end end diff --git a/lib/torch/tensor.rb b/lib/torch/tensor.rb index 318f14e0..cfc4b63c 100644 --- a/lib/torch/tensor.rb +++ b/lib/torch/tensor.rb @@ -115,7 +115,8 @@ def item if numel != 1 raise Error, "only one element tensors can be converted to Ruby scalars" end - to_a.first + # use flatten to handle tensors with a single element but multiple dimensions + to_a.flatten.first end def to_i diff --git a/lib/torch/torchrun.rb b/lib/torch/torchrun.rb index b5913334..9155b892 100644 --- a/lib/torch/torchrun.rb +++ b/lib/torch/torchrun.rb @@ -203,20 +203,28 @@ def launch_worker_group(restart_count) status ensure + @worker_pgid = nil @current_pids = [] end def spawn_workers(restart_count) base_env = base_environment(restart_count) - Array.new(@local_world_size) do |local_rank| + pgid = nil + workers = Array.new(@local_world_size) do |local_rank| env = base_env.merge(rank_environment(local_rank)) - spawn_worker(env, local_rank) + pid, pgid = spawn_worker(env, local_rank, pgid) + pid end + @worker_pgid = pgid + workers end - def spawn_worker(env, local_rank) + def spawn_worker(env, local_rank, pgid) args = command_arguments(local_rank) - Process.spawn(env, *args) + spawn_opts = pgid ? { pgroup: pgid } : { pgroup: true } + pid = Process.spawn(env, *args, spawn_opts) + pgid ||= pid + [pid, pgid] rescue SystemCallError => e raise Error, "failed to launch worker #{local_rank}: #{e.message}" end @@ -287,6 +295,7 @@ def monitor_workers(pids) def terminate_workers(pids) return if pids.empty? 
+ send_process_group_signal("TERM") pids.each { |pid| send_signal(pid, "TERM") } sleep(0.2) pids.each do |pid| @@ -322,6 +331,7 @@ def setup_signal_handlers end def forward_signal(sig) + send_process_group_signal(sig) (@current_pids || []).each { |pid| send_signal(pid, sig) } end @@ -339,6 +349,14 @@ def send_signal(pid, sig) nil end + def send_process_group_signal(sig) + return unless @worker_pgid + + Process.kill(sig, -@worker_pgid) + rescue Errno::ESRCH + nil + end + def cleanup_workers(pids) pids.each do |pid| next unless process_alive?(pid) diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 487f3af0..659a4930 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -1,20 +1,149 @@ require_relative "test_helper" +require "torch/distributed" require "socket" -class DistributedTest < Minitest::Test +class DistributedInitProcessGroupTest < Minitest::Test + def setup + skip "Distributed backend not available" unless Torch::Distributed.available? + end + + def test_defaults_nccl_device_id_from_local_rank_env + calls = [] + with_stubbed_init_process_group(calls) do + ENV["LOCAL_RANK"] = "2" + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 5, world_size: 8) + ensure + ENV.delete("LOCAL_RANK") + end + + assert_equal 1, calls.size + assert_equal 2, calls.first[:device_id] + end + + def test_falls_back_to_local_world_size_modulo + calls = [] + with_stubbed_init_process_group(calls) do + ENV["LOCAL_WORLD_SIZE"] = "2" + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 3, world_size: 4) + ensure + ENV.delete("LOCAL_WORLD_SIZE") + end + + assert_equal 1, calls.size + assert_equal 1, calls.first[:device_id] + end + + def test_uses_world_size_when_env_missing + calls = [] + with_stubbed_init_process_group(calls) do + Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 1, world_size: 2) + end + + assert_equal 1, calls.size + assert_equal 1, calls.first[:device_id] + end + + 
private + + def with_stubbed_init_process_group(calls) + original = Torch::Distributed.method(:_init_process_group) + Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| + calls << {backend: backend, rank: rank, world_size: world_size, timeout_ms: timeout_ms, device_id: device_id} + :stub + end + yield + ensure + Torch::Distributed.singleton_class.define_method(:_init_process_group, original) + end +end + +class DistributedSpawnStartMethodTest < Minitest::Test + def test_spawn_worker_env_runs_block + reader, writer = IO.pipe + writer.close_on_exec = false + + pid = fork do + reader.close + ENV[Torch::Distributed::SPAWN_ENV_KEY] = "1" + ENV[Torch::Distributed::SPAWN_RANK_ENV_KEY] = "0" + ENV[Torch::Distributed::SPAWN_WORLD_SIZE_ENV_KEY] = "1" + ENV[Torch::Distributed::SPAWN_PORT_ENV_KEY] = "1234" + ENV[Torch::Distributed::SPAWN_PIPE_ENV_KEY] = writer.fileno.to_s + Torch::Distributed.fork_world(1, start_method: :spawn) { |rank, port| [rank, port] } + end + + writer.close + result = Marshal.load(reader) + reader.close + + _pid, status = Process.wait2(pid) + assert status.success? + assert_equal [0, 1234], result + end +end + +class DistributedBackendTest < Minitest::Test + BACKEND = nil + def setup super skip "Distributed backend not available" unless Torch::Distributed.available? + skip "No backend configured for test" unless backend + skip_unless_backend_available! end - def test_all_reduce - results = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) - Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) + def backend + self.class::BACKEND + end + + def tensor_options + {} + end + + def skip_unless_backend_available! + skip "#{backend} backend not available" unless backend_available? + end + + def backend_available? 
+ port = Torch::Distributed.free_port + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false) + Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1) + true + rescue StandardError => e + return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i + raise + ensure + Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? + end + + def nccl_device_id(rank) + rank + end - tensor = Torch.tensor([rank + 1.0]) + def fork_with_backend(world_size: 2, start_method: :fork) + original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] + original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] + ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn + ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn + Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) + device_id = backend == "nccl" ? nccl_device_id(rank) : nil + Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size, device_id: device_id) + begin + yield(rank) + ensure + Torch::Distributed.destroy_process_group + end + end + ensure + ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = original_filter + ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = original_script + end + + def test_all_reduce + results = fork_with_backend do |rank| + tensor = Torch.tensor([rank + 1.0], **tensor_options) Torch::Distributed.all_reduce(tensor) - Torch::Distributed.destroy_process_group tensor.to_a end @@ -22,15 +151,11 @@ def test_all_reduce end def test_barrier - wait_times = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
- Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - + wait_times = fork_with_backend do |rank| sleep 0.3 if rank.zero? before = Process.clock_gettime(Process::CLOCK_MONOTONIC) Torch::Distributed.barrier after = Process.clock_gettime(Process::CLOCK_MONOTONIC) - Torch::Distributed.destroy_process_group after - before end @@ -39,13 +164,9 @@ def test_barrier end def test_broadcast - tensors = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) - Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - - tensor = Torch.tensor([rank + 1.0]) + tensors = fork_with_backend do |rank| + tensor = Torch.tensor([rank + 1.0], **tensor_options) Torch::Distributed.broadcast(tensor, src: 0) - Torch::Distributed.destroy_process_group tensor.to_a end @@ -53,25 +174,45 @@ def test_broadcast end def test_ddp_gradient_sync - grads = Torch::Distributed.fork_world(2) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 2, rank.zero?) 
- Torch::Distributed.init_process_group("gloo", store: store, rank: rank, world_size: 2) - + grads = fork_with_backend do |rank| + device = tensor_options[:device] model = Torch::NN::Linear.new(1, 1, bias: false) + model = model.to(device) if device ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) - input = Torch.tensor([[rank + 1.0]]) + input = Torch.tensor([[rank + 1.0]], **tensor_options) output = ddp.call(input) loss = output.sum loss.backward - grad = model.parameters.first.grad.item - Torch::Distributed.destroy_process_group - grad + grad = model.parameters.first.grad + grad = grad.to("cpu") if device + grad.item end grads.each do |grad| assert_in_delta 1.5, grad, 1e-6 end end +end + +class DistributedGlooTest < DistributedBackendTest + BACKEND = "gloo" +end + +class DistributedNcclTest < DistributedBackendTest + BACKEND = "nccl" + + def setup + skip "CUDA not available for NCCL backend" unless Torch.const_defined?(:CUDA) && Torch::CUDA.available? + skip "Need at least 2 CUDA devices for NCCL tests" unless Torch::CUDA.device_count >= 2 + super + end + def tensor_options + {device: "cuda"} + end + + def fork_with_backend(world_size: 2, start_method: :spawn) + super(world_size: world_size, start_method: start_method) + end end diff --git a/test/test_helper.rb b/test/test_helper.rb index 347bbb4f..76f913cf 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,7 +1,33 @@ +spawn_worker = ENV["TORCH_DISTRIBUTED_SPAWNED"] == "1" + +# Spawned distributed workers shouldn't try to load minitest plugins from the +# parent test environment. 
+ENV["MT_NO_PLUGINS"] = "1" if spawn_worker + require "bundler/setup" Bundler.require(:default) require "minitest/autorun" +if spawn_worker + module TorchDistributedSpawnTest + module QuietSummaryReporter + def start # :nodoc: + Minitest::StatisticsReporter.instance_method(:start).bind(self).call + self.sync = io.respond_to?(:"sync=") + self.old_sync, io.sync = io.sync, true if self.sync + end + + def report # :nodoc: + super + ensure + io.sync = self.old_sync if self.sync + end + end + end + + Minitest::SummaryReporter.prepend(TorchDistributedSpawnTest::QuietSummaryReporter) +end + # support require_relative "support/net" From 0286f2a1b8bf96e90f2d93756c121c95d9e67db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 22:51:55 +0300 Subject: [PATCH 23/28] possible fix for non-cuda c10 --- ext/torch/cuda.cpp | 32 ++++++++++++++++++++++++-------- ext/torch/extconf.rb | 24 ++++++++++++++++++++---- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/ext/torch/cuda.cpp b/ext/torch/cuda.cpp index 69b2529f..5a7c52e4 100644 --- a/ext/torch/cuda.cpp +++ b/ext/torch/cuda.cpp @@ -1,20 +1,36 @@ #include +#ifdef HAVE_C10_CUDA #include +#endif #include #include "utils.h" void init_cuda(Rice::Module& m) { - Rice::define_module_under(m, "CUDA") + auto rb_mCUDA = Rice::define_module_under(m, "CUDA"); + + rb_mCUDA .define_singleton_function("available?", &torch::cuda::is_available) .define_singleton_function("device_count", &torch::cuda::device_count) .define_singleton_function("manual_seed", &torch::cuda::manual_seed) - .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all) - .define_singleton_function( - "set_device", - [](int device_id) { - c10::cuda::set_device(device_id); - return Rice::Nil; - }); + .define_singleton_function("manual_seed_all", &torch::cuda::manual_seed_all); + +#ifdef HAVE_C10_CUDA + rb_mCUDA.define_singleton_function( + 
"set_device", + [](int device_id) { + c10::cuda::set_device(device_id); + return Rice::Nil; + }); +#else + rb_mCUDA.define_singleton_function( + "set_device", + [](int) { + rb_raise( + rb_eRuntimeError, + "c10 CUDA support is not available in this build; set_device cannot be used"); + return Rice::Nil; + }); +#endif } diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 2cadd1c9..d30f4982 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -55,6 +55,25 @@ $INCFLAGS += " -I#{inc}" $INCFLAGS += " -I#{inc}/torch/csrc/api/include" +CONFIG["CC"] = CONFIG["CXX"] +$CFLAGS = $CXXFLAGS + +supports_c10_cuda = with_cuda && try_compile(<<~CPP) + #include + #include + + int main() { + c10::cuda::set_device(0); + return 0; + } +CPP + +unless supports_c10_cuda + puts "c10 CUDA headers not available; features that require them will be disabled" +else + $defs << " -DHAVE_C10_CUDA" +end + $LDFLAGS += " -Wl,-rpath,#{lib}" if RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i && Dir.exist?("/opt/homebrew/opt/libomp/lib") $LDFLAGS += ",-rpath,/opt/homebrew/opt/libomp/lib" @@ -71,9 +90,6 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end -CONFIG["CC"] = CONFIG["CXX"] -$CFLAGS = $CXXFLAGS - supports_c10d = try_link(<<~CPP, "-DUSE_C10D") #include #include @@ -98,7 +114,7 @@ } CPP -supports_c10d_nccl = with_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") +supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") #include #include From 4276d638ff8ca8b8c52310c45b3060c3c4c73ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 23:44:38 +0300 Subject: [PATCH 24/28] added missing const_cast --- ext/torch/tensor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/torch/tensor.cpp b/ext/torch/tensor.cpp index d5fb0bc3..353ae7cd 100644 --- 
a/ext/torch/tensor.cpp +++ b/ext/torch/tensor.cpp @@ -46,7 +46,7 @@ struct RubyTensorHook { // ruby_init_stack is idempotent and safe to call repeatedly; it ensures the // current native thread is known to the VM before we try to grab the GVL. volatile VALUE stack_anchor = Qnil; - ruby_init_stack(&stack_anchor); + ruby_init_stack(const_cast(&stack_anchor)); } ~RubyTensorHook() { From 51785c99bb53ef3ffcdd498aa7e54f1fca9f639c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Wed, 19 Nov 2025 23:44:54 +0300 Subject: [PATCH 25/28] skipping cuda tests when c10 for nccl in not available --- test/distributed_test.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 659a4930..90c82e17 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -5,6 +5,7 @@ class DistributedInitProcessGroupTest < Minitest::Test def setup skip "Distributed backend not available" unless Torch::Distributed.available? + skip "CUDA not available for NCCL backend" unless cuda_available? end def test_defaults_nccl_device_id_from_local_rank_env @@ -55,6 +56,10 @@ def with_stubbed_init_process_group(calls) ensure Torch::Distributed.singleton_class.define_method(:_init_process_group, original) end + + def cuda_available? + Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
+ end end class DistributedSpawnStartMethodTest < Minitest::Test From e63f784d12d55452a54d21348a43d9a93895522b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 21 Nov 2025 17:48:07 +0300 Subject: [PATCH 26/28] GLOO support --- README.md | 38 ++++ examples/benchmark/training.rb | 213 ++++++++++++++++-- ext/torch/distributed.cpp | 1 + ext/torch/extconf.rb | 30 ++- lib/torch/device.rb | 5 +- lib/torch/distributed.rb | 5 +- .../nn/parallel/distributed_data_parallel.rb | 14 +- test/device_test.rb | 5 + test/distributed_test.rb | 52 +++-- 9 files changed, 306 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 78934193..72161779 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ Then run: bundle config build.torch-rb --with-torch-dir=/path/to/libtorch ``` +In order to build distributed features (if your LibTorch supports it) add the following to the build config string: +```sh +... --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo +``` + And add this line to your application’s Gemfile: ```ruby @@ -95,6 +100,39 @@ On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-re For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. +### Distributed benchmark + +Generate a comparison table across backends, group sizes, and batch sizes: + +```sh +bundle exec ruby examples/benchmark/training.rb --backends gloo,nccl --batch-sizes 32,64,128,256 --gpus 2 --steps 50 +``` + +Example results on dual RTX 3090s: +Processing speed: images per second. Convergence speed: average loss reduction per step and per second. 
+ +```text +Backend | Proc Group | Batch | Images/s | +--------+------------+-------+----------| +gloo | 1 | 32 | 1724.4 | +gloo | 1 | 64 | 1941.8 | +gloo | 1 | 128 | 2038.7 | +gloo | 1 | 256 | 2171.8 | +gloo | 2 | 32 | 2261.0 | +gloo | 2 | 64 | 2870.6 | +gloo | 2 | 128 | 3398.4 | +gloo | 2 | 256 | 3743.1 | +nccl | 1 | 32 | 1804.8 | +nccl | 1 | 64 | 1963.0 | +nccl | 1 | 128 | 2051.5 | +nccl | 1 | 256 | 2143.3 | +nccl | 2 | 32 | 3046.1 | +nccl | 2 | 64 | 3513.6 | +nccl | 2 | 128 | 3892.1 | +nccl | 2 | 256 | 4024.5 | +--------+------------+-------+----------| +``` + ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). There are a few changes to make it more Ruby-like: diff --git a/examples/benchmark/training.rb b/examples/benchmark/training.rb index 090018f6..83b85520 100644 --- a/examples/benchmark/training.rb +++ b/examples/benchmark/training.rb @@ -13,6 +13,70 @@ else Torch::Distributed.get_default_backend_for_device(Torch::Accelerator.current_accelerator) || "gloo" end +SPAWN_BACKEND_ENV = "TORCH_RB_BENCH_BACKEND".freeze +SPAWN_GROUP_ENV = "TORCH_RB_BENCH_GROUP_SIZE".freeze +SPAWN_BATCH_ENV = "TORCH_RB_BENCH_BATCH_SIZE".freeze + +def parse_list(value) + value.split(",").map(&:strip).reject(&:empty?) +end + +def backend_supported?(backend) + return true unless backend == "nccl" + + Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? +end + +def usable_cuda_device_count + return 0 unless Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? + + Torch::CUDA.respond_to?(:device_count) ? Torch::CUDA.device_count : 0 +rescue + 0 +end + +def spawn_worker_process? 
+ ENV[Torch::Distributed::SPAWN_ENV_KEY] == "1" +end + +def apply_spawn_overrides!(options) + return unless ENV[Torch::Distributed::SPAWN_ENV_KEY] == "1" + + if ENV[SPAWN_BACKEND_ENV] + options[:backends] = [ENV[SPAWN_BACKEND_ENV]] + end + + if ENV[SPAWN_GROUP_ENV] + group_size = ENV[SPAWN_GROUP_ENV].to_i + if group_size.positive? + options[:group_sizes] = [group_size] + options[:gpus] = group_size + end + end + + if ENV[SPAWN_BATCH_ENV] + batch_size = ENV[SPAWN_BATCH_ENV].to_i + options[:batch_sizes] = [batch_size] if batch_size.positive? + end +end + +def with_spawn_env(backend:, group_size:, batch_size:) + previous = { + SPAWN_BACKEND_ENV => ENV[SPAWN_BACKEND_ENV], + SPAWN_GROUP_ENV => ENV[SPAWN_GROUP_ENV], + SPAWN_BATCH_ENV => ENV[SPAWN_BATCH_ENV] + } + + ENV[SPAWN_BACKEND_ENV] = backend + ENV[SPAWN_GROUP_ENV] = group_size.to_s + ENV[SPAWN_BATCH_ENV] = batch_size.to_s + + yield +ensure + ENV[SPAWN_BACKEND_ENV] = previous[SPAWN_BACKEND_ENV] + ENV[SPAWN_GROUP_ENV] = previous[SPAWN_GROUP_ENV] + ENV[SPAWN_BATCH_ENV] = previous[SPAWN_BATCH_ENV] +end class MnistCnn < Torch::NN::Module def initialize @@ -47,11 +111,12 @@ def forward(x) def parse_options defaults = { arch: "mnist_cnn", - batch_size: 128, + batch_sizes: [128], steps: 100, warmup: 10, - backend: DEFAULT_BACKEND, + backends: [DEFAULT_BACKEND], gpus: Torch::CUDA.available? ? 
[Torch::CUDA.device_count, 1].max : 1, + group_sizes: nil, data_dir: File.join(__dir__, "data"), lr: 0.01 } @@ -59,15 +124,19 @@ def parse_options OptionParser.new do |opts| opts.banner = "Usage: ruby examples/benchmark/training.rb [options]" opts.on("--arch NAME", "Architecture to benchmark (#{ARCH_CONFIGS.keys.join(', ')}, default: #{defaults[:arch]})") { |v| defaults[:arch] = v } - opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_size]})") { |v| defaults[:batch_size] = v } + opts.on("--batch-size N", Integer, "Batch size per process (default: #{defaults[:batch_sizes].first})") { |v| defaults[:batch_sizes] = [v] } + opts.on("--batch-sizes LIST", String, "Comma-separated batch sizes per process") { |v| defaults[:batch_sizes] = parse_list(v).map(&:to_i) } opts.on("--steps N", Integer, "Number of timed training steps (default: #{defaults[:steps]})") { |v| defaults[:steps] = v } opts.on("--warmup N", Integer, "Number of warmup steps not included in timing (default: #{defaults[:warmup]})") { |v| defaults[:warmup] = v } - opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backend]})") { |v| defaults[:backend] = v } + opts.on("--backend NAME", String, "Process group backend (default: #{defaults[:backends].first})") { |v| defaults[:backends] = [v] } + opts.on("--backends LIST", String, "Comma-separated list of backends to benchmark (gloo,nccl)") { |v| defaults[:backends] = parse_list(v) } opts.on("--gpus N", Integer, "Number of GPUs/processes to use (1 for non-distributed)") { |v| defaults[:gpus] = v } + opts.on("--group-sizes LIST", String, "Process group sizes to benchmark (default: 1..gpus)") { |v| defaults[:group_sizes] = parse_list(v).map(&:to_i) } opts.on("--data-dir PATH", String, "Directory for cached datasets (default: #{defaults[:data_dir]})") { |v| defaults[:data_dir] = v } opts.on("--lr FLOAT", Float, "Learning rate (default: #{defaults[:lr]})") { |v| defaults[:lr] = v } end.parse!(ARGV) + 
defaults[:group_sizes] ||= (1..defaults[:gpus]).to_a defaults end @@ -110,15 +179,16 @@ def benchmark_worker(rank, world_size, port, options) raise ArgumentError, "Unsupported architecture #{arch.inspect}" unless config distributed = world_size > 1 + accelerator = Torch::Accelerator.current_accelerator + selected_backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND if distributed store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) - accelerator = Torch::Accelerator.current_accelerator - backend = options[:backend] || Torch::Distributed.get_default_backend_for_device(accelerator) || DEFAULT_BACKEND - Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size) + Torch::Distributed.init_process_group(selected_backend, store: store, rank: rank, world_size: world_size) end - device = if Torch::CUDA.available? && options[:gpus] > 0 - Torch.device("cuda:#{rank % Torch::CUDA.device_count}") + cuda_devices = usable_cuda_device_count + device = if cuda_devices.positive? && options[:gpus] > 0 + Torch.device("cuda:#{rank % cuda_devices}") else Torch.device("cpu") end @@ -139,7 +209,9 @@ def benchmark_worker(rank, world_size, port, options) warmup_steps = options[:warmup] timed_steps = options[:steps] total_steps = warmup_steps + timed_steps + losses = [] + # Warm up the model (including one full timed-length pass) to avoid init overhead in measurements. step_idx = 0 loader.each do |data, target| data = data.to(device) @@ -169,6 +241,14 @@ def benchmark_worker(rank, world_size, port, options) loss.backward optimizer.step + loss_value = loss.item + if distributed + loss_tensor = Torch.tensor([loss_value], device: device) + Torch::Distributed.all_reduce(loss_tensor) + loss_value = loss_tensor.item / world_size.to_f + end + losses << loss_value if !distributed || rank.zero? 
+ step_idx += 1 break if step_idx >= timed_steps end @@ -178,30 +258,115 @@ def benchmark_worker(rank, world_size, port, options) elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start timed = step_idx - if rank.zero? - images = timed * options[:batch_size] * world_size - puts "Architecture: #{arch}" - puts "Dataset: #{config[:dataset]}" - puts "GPUs: #{world_size}" - puts "Batch size per process: #{options[:batch_size]}" - puts "Timed steps: #{timed}" - puts "Total images: #{images}" - puts format("Elapsed: %.3fs | Throughput: %.1f images/s", elapsed, images / elapsed) + images = timed * options[:batch_size] * world_size + throughput = elapsed.positive? ? images.to_f / elapsed : 0.0 + initial_loss = losses.first || 0.0 + final_loss = losses.last || initial_loss + loss_delta = initial_loss - final_loss + loss_delta_per_step = timed.zero? ? 0.0 : loss_delta / timed + loss_delta_per_sec = elapsed.zero? ? 0.0 : loss_delta / elapsed + + result = if !distributed || rank.zero? + { + backend: selected_backend, + world_size: world_size, + batch_size: options[:batch_size], + arch: arch, + dataset: config[:dataset], + elapsed: elapsed, + timed_steps: timed, + images: images, + throughput: throughput, + initial_loss: initial_loss, + final_loss: final_loss, + loss_delta: loss_delta, + loss_delta_per_step: loss_delta_per_step, + loss_delta_per_sec: loss_delta_per_sec + } end Torch::Distributed.destroy_process_group if distributed + result +end + +def run_benchmark_case(world_size, options) + if world_size > 1 + outputs = Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| + benchmark_worker(rank, world_size, port, options) + end + outputs.compact.first + else + benchmark_worker(0, 1, Torch::Distributed.free_port, options) + end +end + +def print_summary_table(results) + puts "\nBenchmark comparison (processing vs convergence)" + puts "Processing speed: images per second. 
Convergence speed: average loss reduction per step and per second.\n" + + headers = ["Backend", "Proc Group", "Batch", "Images/s", "Loss delta/step", "Loss delta/s", "Final loss"] + formatters = [ + ->(r) { r[:backend] }, + ->(r) { r[:world_size] }, + ->(r) { r[:batch_size] }, + ->(r) { format("%.1f", r[:throughput]) }, + ->(r) { format("%.4f", r[:loss_delta_per_step]) }, + ->(r) { format("%.4f", r[:loss_delta_per_sec]) }, + ->(r) { format("%.4f", r[:final_loss]) } + ] + + widths = headers.each_with_index.map do |header, idx| + [header.length, results.map { |r| formatters[idx].call(r).to_s.length }.max].compact.max + end + + header_line = headers.each_with_index.map { |h, idx| h.ljust(widths[idx]) }.join(" | ") + divider = widths.map { |w| "-" * w }.join("-+-") + puts header_line + puts divider + + results.sort_by { |r| [r[:backend], r[:world_size], r[:batch_size]] }.each do |result| + row = formatters.each_with_index.map { |formatter, idx| formatter.call(result).to_s.ljust(widths[idx]) } + puts row.join(" | ") + end end options = parse_options -world_size = options[:gpus] -raise "Number of GPUs requested must be >= 1" if world_size < 1 +apply_spawn_overrides!(options) +max_world_size = options[:gpus] +raise "Number of GPUs requested must be >= 1" if max_world_size < 1 Torch.manual_seed(1) -if world_size > 1 +group_sizes = options[:group_sizes].map { |v| [v, max_world_size].min }.select { |v| v >= 1 }.uniq.sort +batch_sizes = options[:batch_sizes].map { |v| [v, 1].max }.uniq +backends = options[:backends].map(&:downcase).uniq + +if group_sizes.any? { |size| size > 1 } raise "torch.distributed is not available" unless Torch::Distributed.available? - Torch::Distributed.fork_world(world_size, start_method: :spawn) do |rank, port| - benchmark_worker(rank, world_size, port, options) +end + +results = [] + +backends.each do |backend| + unless backend_supported?(backend) + warn "Skipping backend=#{backend} because required accelerator support is unavailable." 
+ next end + + group_sizes.each do |world_size| + batch_sizes.each do |batch_size| + run_options = options.merge(batch_size: batch_size, backend: backend, gpus: world_size) + puts "Running backend=#{backend}, group_size=#{world_size}, batch_size=#{batch_size}..." unless spawn_worker_process? + with_spawn_env(backend: backend, group_size: world_size, batch_size: batch_size) do + results << run_benchmark_case(world_size, run_options) + end + end + end +end + +results.compact! + +if results.empty? + puts "No benchmark results to report." else - benchmark_worker(0, 1, Torch::Distributed.free_port, options) + print_summary_table(results) end diff --git a/ext/torch/distributed.cpp b/ext/torch/distributed.cpp index b3a22bc3..a3d50680 100644 --- a/ext/torch/distributed.cpp +++ b/ext/torch/distributed.cpp @@ -198,6 +198,7 @@ void init_distributed(Rice::Module& m) { } else if (backend_lower == "nccl") { #if defined(USE_C10D_NCCL) auto options = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); + options->timeout = std::chrono::milliseconds(timeout_millis); pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(store, rank, world_size, options); #else rb_raise(rb_eRuntimeError, "NCCL backend is not available in this build"); diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index d30f4982..916588d9 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -38,6 +38,9 @@ cudnn_inc, cudnn_lib = dir_config("cudnn") cudnn_lib ||= "/usr/local/cuda/lib" +gloo_inc, _ = dir_config("gloo") +gloo_inc ||= "./vendor/gloo" + $LDFLAGS += " -L#{lib}" if Dir.exist?(lib) abort "LibTorch not found" unless have_library("torch") @@ -68,9 +71,7 @@ } CPP -unless supports_c10_cuda - puts "c10 CUDA headers not available; features that require them will be disabled" -else +if supports_c10_cuda $defs << " -DHAVE_C10_CUDA" end @@ -100,6 +101,19 @@ } CPP +if supports_c10d + $defs << " -DUSE_C10D" + puts "Building with distributed support" + + if find_header("gloo/algorithm.h", gloo_inc) 
+ $INCFLAGS += " -I#{gloo_inc}" + else + puts "GLOO headers not found. Consider setting --with-gloo-include param" + end +else + puts "Building without distributed support" +end + supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") #include #include @@ -125,17 +139,13 @@ } CPP -if supports_c10d - $defs << " -DUSE_C10D" - puts "Building with distributed support" -else - puts "Building without distributed support" -end - if supports_c10d_gloo $defs << "-DUSE_C10D_GLOO" puts "GLOO support detected" end +unless supports_c10_cuda + puts "No c10 CUDA headers found. NCCL is unavailable" +end if supports_c10d_nccl $defs << "-DUSE_C10D_NCCL" puts "NCCL support detected" diff --git a/lib/torch/device.rb b/lib/torch/device.rb index f80868ff..0621e463 100644 --- a/lib/torch/device.rb +++ b/lib/torch/device.rb @@ -8,7 +8,10 @@ def inspect extra = ", index: #{index.inspect}" if index? "device(type: #{type.inspect}#{extra})" end - alias_method :to_s, :inspect + + def to_s + _str + end def ==(other) eql?(other) diff --git a/lib/torch/distributed.rb b/lib/torch/distributed.rb index 7e1e739d..d69e0396 100644 --- a/lib/torch/distributed.rb +++ b/lib/torch/distributed.rb @@ -53,7 +53,10 @@ def init_process_group(backend = nil, init_method: "env://", store: nil, rank: n timeout_ms = (timeout * 1000).to_i bound_device_id = device_id.nil? ? -1 : Integer(device_id) if backend == "nccl" && bound_device_id >= 0 && Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:set_device) - Torch::CUDA.set_device(bound_device_id) + device_count = Torch::CUDA.device_count if Torch::CUDA.respond_to?(:device_count) + # Only attempt to switch devices when the requested id exists to avoid + # raising on hosts with fewer GPUs than the provided local rank. + Torch::CUDA.set_device(bound_device_id) if device_count.nil? 
|| bound_device_id < device_count end pg = _init_process_group(backend, store, rank, world_size, timeout_ms, bound_device_id) warmup_process_group(pg, backend) diff --git a/lib/torch/nn/parallel/distributed_data_parallel.rb b/lib/torch/nn/parallel/distributed_data_parallel.rb index 2a1782c8..dd5e0245 100644 --- a/lib/torch/nn/parallel/distributed_data_parallel.rb +++ b/lib/torch/nn/parallel/distributed_data_parallel.rb @@ -84,17 +84,21 @@ def move_value(value, device) def synchronize_parameters Torch::Distributed.barrier(group: @process_group) - @module.parameters.each do |param| - Torch::Distributed.broadcast(param, src: 0, group: @process_group) + Torch.no_grad do + @module.parameters.each do |param| + Torch::Distributed.broadcast(param, src: 0, group: @process_group) + end + broadcast_buffers_if_needed end - broadcast_buffers_if_needed end def broadcast_buffers_if_needed return unless @broadcast_buffers - @module.buffers.each do |buffer| - Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + Torch.no_grad do + @module.buffers.each do |buffer| + Torch::Distributed.broadcast(buffer, src: 0, group: @process_group) + end end end diff --git a/test/device_test.rb b/test/device_test.rb index 69f778f6..b31b3348 100644 --- a/test/device_test.rb +++ b/test/device_test.rb @@ -22,4 +22,9 @@ def test_inspect assert_equal %!device(type: "cpu")!, Torch.device("cpu").inspect assert_equal %!device(type: "cpu", index: 0)!, Torch.device("cpu:0").inspect end + + def test_to_s + assert_equal "cpu", Torch.device("cpu").to_s + assert_equal "cpu:0", Torch.device("cpu:0").to_s + end end diff --git a/test/distributed_test.rb b/test/distributed_test.rb index 90c82e17..7491c9a7 100644 --- a/test/distributed_test.rb +++ b/test/distributed_test.rb @@ -1,6 +1,7 @@ require_relative "test_helper" require "torch/distributed" require "socket" +require "timeout" class DistributedInitProcessGroupTest < Minitest::Test def setup @@ -46,6 +47,8 @@ def 
test_uses_world_size_when_env_missing private + # Stub out low-level init to capture arguments without starting a real process group + # Used for upper-level tests that don't require actial process group spawning def with_stubbed_init_process_group(calls) original = Torch::Distributed.method(:_init_process_group) Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| @@ -110,9 +113,10 @@ def skip_unless_backend_available! end def backend_available? + timeout = distributed_timeout port = Torch::Distributed.free_port - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false) - Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1) + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false, timeout: timeout) + Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1, timeout: timeout) true rescue StandardError => e return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i @@ -121,23 +125,30 @@ def backend_available? Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? end - def nccl_device_id(rank) - rank - end - - def fork_with_backend(world_size: 2, start_method: :fork) + def fork_with_backend(world_size: 2, start_method: :spawn) + timeout = distributed_timeout original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn - Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?) - device_id = backend == "nccl" ? 
nccl_device_id(rank) : nil - Torch::Distributed.init_process_group(backend, store: store, rank: rank, world_size: world_size, device_id: device_id) - begin - yield(rank) - ensure - Torch::Distributed.destroy_process_group + Timeout.timeout(timeout, Timeout::Error, "distributed test exceeded #{timeout}s") do + Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| + Timeout.timeout(timeout, Timeout::Error, "distributed worker #{rank} exceeded #{timeout}s") do + store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?, timeout: timeout) + Torch::Distributed.init_process_group( + backend, + store: store, + rank: rank, + world_size: world_size, + device_id: rank, + timeout: timeout + ) + begin + yield(rank) + ensure + Torch::Distributed.destroy_process_group + end + end end end ensure @@ -179,7 +190,8 @@ def test_broadcast end def test_ddp_gradient_sync - grads = fork_with_backend do |rank| + # autograd cannot run safely with fork-based multiprocessing; always use spawn here + grads = fork_with_backend(start_method: :spawn) do |rank| device = tensor_options[:device] model = Torch::NN::Linear.new(1, 1, bias: false) model = model.to(device) if device @@ -198,10 +210,18 @@ def test_ddp_gradient_sync assert_in_delta 1.5, grad, 1e-6 end end + + def distributed_timeout + Integer(ENV.fetch("TORCH_DISTRIBUTED_TEST_TIMEOUT", "30")) + end end class DistributedGlooTest < DistributedBackendTest BACKEND = "gloo" + + def fork_with_backend(world_size: 2, start_method: :fork) + super(world_size: world_size, start_method: start_method) + end end class DistributedNcclTest < DistributedBackendTest From 3ab737c4edf75feee07ecbdbc3a88a2677a1f53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 5 Dec 2025 11:34:46 +0300 Subject: [PATCH 27/28] only multi-device core functionality remains --- README.md | 79 ++----------------------------------- 
ext/torch/ext.cpp | 2 - ext/torch/extconf.rb | 66 +------------------------------ lib/torch.rb | 2 - lib/torch/nn/module.rb | 53 ------------------------- lib/torch/nn/module_list.rb | 6 ++- test/nn/module_test.rb | 13 ------ test/test_helper.rb | 26 ------------ torch-rb.gemspec | 4 +- 9 files changed, 11 insertions(+), 240 deletions(-) diff --git a/README.md b/README.md index 72161779..320e6d3a 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ As well as: First, [download LibTorch](https://pytorch.org/get-started/locally/). For Mac arm64, use: ```sh -curl -L https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.9.0.zip > libtorch.zip +curl -L https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.9.1.zip > libtorch.zip unzip -q libtorch.zip ``` @@ -34,11 +34,6 @@ Then run: bundle config build.torch-rb --with-torch-dir=/path/to/libtorch ``` -In order to build distributed features (if your LibTorch supports it) add the following to the build config string: -```sh -... --with-cuda-include=/path/to/cuda/include --with-gloo-include=/path/to/gloo/repo -``` - And add this line to your application’s Gemfile: ```ruby @@ -47,6 +42,8 @@ gem "torch-rb" It can take 5-10 minutes to compile the extension. Windows is not currently supported. +For distributed data parallel helpers, add the optional `torch-ddp` gem alongside this one. + ## Getting Started A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutorials/blitz/README.md). 
@@ -60,79 +57,9 @@ A good place to start is [Deep Learning with Torch.rb: A 60 Minute Blitz](tutori ## Examples - [Image classification with MNIST](examples/mnist) ([日本語版](https://qiita.com/kojix2/items/c19c36dc1bf73ea93409)) -- [Distributed MNIST training](examples/mnist/distributed.rb) -- [Training benchmarks (variable batch size / GPU count)](examples/benchmark/training.rb) - [Collaborative filtering with MovieLens](examples/movielens) - [Generative adversarial networks](examples/gan) -Run the benchmark with: - -```sh -bundle exec ruby examples/benchmark/training.rb --arch mnist_cnn --batch-size 256 --gpus 1 --steps 50 -``` - -Set `--gpus` to 2+ to enable distributed training; `--steps` measures only timed steps and `--warmup` sets warmup iterations. - -## Distributed Training - -Torch.rb ships with a `torchrun` launcher that mirrors the PyTorch CLI. It handles process orchestration and sets the `RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT` environment variables expected by `Torch::Distributed.init_process_group`. - -Start a single-node job with a process per GPU (or CPU) with: - -```sh -bundle exec torchrun --standalone --nproc-per-node=gpu path/to/training_script.rb --script-arg value -``` - -For multi-node runs, launch the same command on every node with matching rendezvous settings: - -```sh -bundle exec torchrun \ - --nnodes=2 \ - --node-rank=0 \ - --rdzv-backend=c10d \ - --rdzv-endpoint=host0.example.com:29503 \ - --rdzv-id=my-job \ - --nproc-per-node=4 \ - path/to/training_script.rb -``` - -On node 1, change `--node-rank=1`. The launcher restarts workers up to `--max-restarts` times and can be combined with tools like `bundle exec` or custom scripts via `--no-ruby`. - -For scripts that use the `Torch::Distributed.fork_world` helper directly, set `start_method: :spawn` to launch fresh worker processes instead of forking. This matches Python’s multiprocessing start methods and avoids CUDA fork issues. 
- -### Distributed benchmark - -Generate a comparison table across backends, group sizes, and batch sizes: - -```sh -bundle exec ruby examples/benchmark/training.rb --backends gloo,nccl --batch-sizes 32,64,128,256 --gpus 2 --steps 50 -``` - -Example results on dual RTX 3090s: -Processing speed: images per second. Convergence speed: average loss reduction per step and per second. - -```text -Backend | Proc Group | Batch | Images/s | ---------+------------+-------+----------| -gloo | 1 | 32 | 1724.4 | -gloo | 1 | 64 | 1941.8 | -gloo | 1 | 128 | 2038.7 | -gloo | 1 | 256 | 2171.8 | -gloo | 2 | 32 | 2261.0 | -gloo | 2 | 64 | 2870.6 | -gloo | 2 | 128 | 3398.4 | -gloo | 2 | 256 | 3743.1 | -nccl | 1 | 32 | 1804.8 | -nccl | 1 | 64 | 1963.0 | -nccl | 1 | 128 | 2051.5 | -nccl | 1 | 256 | 2143.3 | -nccl | 2 | 32 | 3046.1 | -nccl | 2 | 64 | 3513.6 | -nccl | 2 | 128 | 3892.1 | -nccl | 2 | 256 | 4024.5 | ---------+------------+-------+----------| -``` - ## API This library follows the [PyTorch API](https://pytorch.org/docs/stable/torch.html). 
There are a few changes to make it more Ruby-like: diff --git a/ext/torch/ext.cpp b/ext/torch/ext.cpp index dc9cef20..c07528b8 100644 --- a/ext/torch/ext.cpp +++ b/ext/torch/ext.cpp @@ -7,7 +7,6 @@ void init_linalg(Rice::Module& m); void init_nn(Rice::Module& m); void init_special(Rice::Module& m); void init_accelerator(Rice::Module& m); -void init_distributed(Rice::Module& m); void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions); void init_torch(Rice::Module& m); @@ -49,5 +48,4 @@ void Init_ext() { init_generator(m, rb_cGenerator); init_ivalue(m, rb_cIValue); init_random(m); - init_distributed(m); } diff --git a/ext/torch/extconf.rb b/ext/torch/extconf.rb index 916588d9..0032088d 100644 --- a/ext/torch/extconf.rb +++ b/ext/torch/extconf.rb @@ -38,9 +38,6 @@ cudnn_inc, cudnn_lib = dir_config("cudnn") cudnn_lib ||= "/usr/local/cuda/lib" -gloo_inc, _ = dir_config("gloo") -gloo_inc ||= "./vendor/gloo" - $LDFLAGS += " -L#{lib}" if Dir.exist?(lib) abort "LibTorch not found" unless have_library("torch") @@ -50,7 +47,7 @@ with_cuda = false if Dir["#{lib}/*torch_cuda*"].any? 
$LDFLAGS += " -L#{cuda_lib}" if Dir.exist?(cuda_lib) - $INCFLAGS += " -I#{cuda_inc}" if Dir.exist?(cuda_inc) + $INCFLAGS += " -I#{cuda_inc}" if cuda_inc && Dir.exist?(cuda_inc) $LDFLAGS += " -L#{cudnn_lib}" if Dir.exist?(cudnn_lib) && cudnn_lib != cuda_lib with_cuda = have_library("cuda") && have_library("cudnn") end @@ -61,6 +58,7 @@ CONFIG["CC"] = CONFIG["CXX"] $CFLAGS = $CXXFLAGS +abort "cuda.h not found" if with_cuda && !find_header("cuda.h") supports_c10_cuda = with_cuda && try_compile(<<~CPP) #include #include @@ -91,66 +89,6 @@ $LDFLAGS += " -Wl,--no-as-needed,#{lib}/libtorch.so" end -supports_c10d = try_link(<<~CPP, "-DUSE_C10D") - #include - #include - - int main() { - ::c10d::FileStore store("unused", 1); - return 0; - } -CPP - -if supports_c10d - $defs << " -DUSE_C10D" - puts "Building with distributed support" - - if find_header("gloo/algorithm.h", gloo_inc) - $INCFLAGS += " -I#{gloo_inc}" - else - puts "GLOO headers not found. Consider setting --with-gloo-include param" - end -else - puts "Building without distributed support" -end - -supports_c10d_gloo = supports_c10d && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_GLOO") - #include - #include - #include - - int main() { - auto store = c10::make_intrusive<::c10d::FileStore>("unused", 1); - auto opts = ::c10d::ProcessGroupGloo::Options::create(); - opts->devices.push_back(::c10d::ProcessGroupGloo::createDefaultDevice()); - ::c10d::ProcessGroupGloo pg(store, 0, 1, opts); - return static_cast(pg.getRank()); - } -CPP - -supports_c10d_nccl = with_cuda && supports_c10_cuda && try_link(<<~CPP, "-DUSE_C10D -DUSE_C10D_NCCL") - #include - #include - - int main() { - auto opts = c10::make_intrusive<::c10d::ProcessGroupNCCL::Options>(); - opts->is_high_priority_stream = false; - return 0; - } -CPP - -if supports_c10d_gloo - $defs << "-DUSE_C10D_GLOO" - puts "GLOO support detected" -end -unless supports_c10_cuda - puts "No c10 CUDA headers found. 
NCCL is unavailable" -end -if supports_c10d_nccl - $defs << "-DUSE_C10D_NCCL" - puts "NCCL support detected" -end - # generate C++ functions puts "Generating C++ functions..." require_relative "../../codegen/generate_functions" diff --git a/lib/torch.rb b/lib/torch.rb index dd652872..667315a5 100644 --- a/lib/torch.rb +++ b/lib/torch.rb @@ -10,7 +10,6 @@ # modules require_relative "torch/device" require_relative "torch/accelerator" -require_relative "torch/distributed" require_relative "torch/inspector" require_relative "torch/tensor" require_relative "torch/version" @@ -193,7 +192,6 @@ require_relative "torch/nn/functional" require_relative "torch/nn/functional_attention" require_relative "torch/nn/init" -require_relative "torch/nn/parallel/distributed_data_parallel" # utils require_relative "torch/utils/data" diff --git a/lib/torch/nn/module.rb b/lib/torch/nn/module.rb index df0500b5..f9a76f98 100644 --- a/lib/torch/nn/module.rb +++ b/lib/torch/nn/module.rb @@ -437,58 +437,5 @@ def dup_value(v, memo) end end end - - class ModuleList < Module - def initialize(mods = nil) - super() - - return unless mods - self.extend(mods) - end - - def length - @modules.length - end - - alias :count :length - - def extend(mods) - raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each) - - mods.each { |m| append m } - - self - end - - def each(&block) - @modules.values.each &block - end - - def map(&block) - @modules.values.map &block - end - - def inject(inj, &block) - @modules.values.inject(inj, &block) - end - - def append(mod) - raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) - add_module(length.to_s, mod) - self - end - - def [](*idx) - idx.map do |id| - if id.is_a?(Integer) - @modules[id.to_s] - elsif id.is_a?(Range) - id.each do |i| - @modules[i.to_s] - end - end - end.flatten - end - end end end diff --git a/lib/torch/nn/module_list.rb b/lib/torch/nn/module_list.rb index 925bab5b..02c17575 100644 --- 
a/lib/torch/nn/module_list.rb +++ b/lib/torch/nn/module_list.rb @@ -6,7 +6,7 @@ class ModuleList < Module def initialize(mods = nil) super() - self.concat(mods) if mods + concat(mods) if mods end def length @@ -31,6 +31,10 @@ def each(&block) end end + def map(&block) + @modules.values.map(&block) + end + def append(mod) raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module) add_module(length.to_s, mod) diff --git a/test/nn/module_test.rb b/test/nn/module_test.rb index 33790da0..d52429ea 100644 --- a/test/nn/module_test.rb +++ b/test/nn/module_test.rb @@ -71,19 +71,6 @@ def test_state_dict_buffers net.eval end - def test_state_dict_with_buffers - net = SimpleResidualBlock.new - expected_keys = %w[seq.0.weight seq.1.weight seq.1.bias seq.1.running_mean seq.1.running_var seq.1.num_batches_tracked seq.3.weight seq.4.weight seq.4.bias seq.4.running_mean seq.4.running_var seq.4.num_batches_tracked seq.6.weight seq.7.weight seq.7.bias seq.7.running_mean seq.7.running_var seq.7.num_batches_tracked] - assert_equal expected_keys, net.state_dict.keys - - tmpfile = Tempfile.new - Torch.save net.state_dict, tmpfile.path - - net = SimpleResidualBlock.new - net.load_state_dict Torch.load tmpfile.path - net.eval - end - def test_inspect assert_match "(conv1): Conv2d(1, 6, kernel_size: [3, 3], stride: [1, 1])", net.inspect end diff --git a/test/test_helper.rb b/test/test_helper.rb index 76f913cf..347bbb4f 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,33 +1,7 @@ -spawn_worker = ENV["TORCH_DISTRIBUTED_SPAWNED"] == "1" - -# Spawned distributed workers shouldn't try to load minitest plugins from the -# parent test environment. 
-ENV["MT_NO_PLUGINS"] = "1" if spawn_worker - require "bundler/setup" Bundler.require(:default) require "minitest/autorun" -if spawn_worker - module TorchDistributedSpawnTest - module QuietSummaryReporter - def start # :nodoc: - Minitest::StatisticsReporter.instance_method(:start).bind(self).call - self.sync = io.respond_to?(:"sync=") - self.old_sync, io.sync = io.sync, true if self.sync - end - - def report # :nodoc: - super - ensure - io.sync = self.old_sync if self.sync - end - end - end - - Minitest::SummaryReporter.prepend(TorchDistributedSpawnTest::QuietSummaryReporter) -end - # support require_relative "support/net" diff --git a/torch-rb.gemspec b/torch-rb.gemspec index 40c89325..0adcc03b 100644 --- a/torch-rb.gemspec +++ b/torch-rb.gemspec @@ -10,9 +10,7 @@ Gem::Specification.new do |spec| spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" - spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*", "bin/*"] - spec.executables = Dir["bin/*"].map { |file| File.basename(file) } - spec.bindir = "bin" + spec.files = Dir["*.{md,txt}", "{codegen,ext,lib}/**/*"] spec.require_path = "lib" spec.extensions = ["ext/torch/extconf.rb"] From 03f468264bc112cf1b0bc1f887905f3841aa10a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=20=D0=A0=D0=B0=D0=B7=D1=83=D0=B2?= =?UTF-8?q?=D0=B0=D0=B5=D0=B2?= Date: Fri, 5 Dec 2025 11:40:18 +0300 Subject: [PATCH 28/28] distributed tests removed --- test/distributed_test.rb | 243 --------------------------------------- 1 file changed, 243 deletions(-) delete mode 100644 test/distributed_test.rb diff --git a/test/distributed_test.rb b/test/distributed_test.rb deleted file mode 100644 index 7491c9a7..00000000 --- a/test/distributed_test.rb +++ /dev/null @@ -1,243 +0,0 @@ -require_relative "test_helper" -require "torch/distributed" -require "socket" -require "timeout" - -class DistributedInitProcessGroupTest < Minitest::Test - def setup - skip "Distributed backend not available" unless Torch::Distributed.available? 
- skip "CUDA not available for NCCL backend" unless cuda_available? - end - - def test_defaults_nccl_device_id_from_local_rank_env - calls = [] - with_stubbed_init_process_group(calls) do - ENV["LOCAL_RANK"] = "2" - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 5, world_size: 8) - ensure - ENV.delete("LOCAL_RANK") - end - - assert_equal 1, calls.size - assert_equal 2, calls.first[:device_id] - end - - def test_falls_back_to_local_world_size_modulo - calls = [] - with_stubbed_init_process_group(calls) do - ENV["LOCAL_WORLD_SIZE"] = "2" - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 3, world_size: 4) - ensure - ENV.delete("LOCAL_WORLD_SIZE") - end - - assert_equal 1, calls.size - assert_equal 1, calls.first[:device_id] - end - - def test_uses_world_size_when_env_missing - calls = [] - with_stubbed_init_process_group(calls) do - Torch::Distributed.init_process_group("nccl", store: Object.new, rank: 1, world_size: 2) - end - - assert_equal 1, calls.size - assert_equal 1, calls.first[:device_id] - end - - private - - # Stub out low-level init to capture arguments without starting a real process group - # Used for upper-level tests that don't require actial process group spawning - def with_stubbed_init_process_group(calls) - original = Torch::Distributed.method(:_init_process_group) - Torch::Distributed.singleton_class.define_method(:_init_process_group) do |backend, store, rank, world_size, timeout_ms, device_id| - calls << {backend: backend, rank: rank, world_size: world_size, timeout_ms: timeout_ms, device_id: device_id} - :stub - end - yield - ensure - Torch::Distributed.singleton_class.define_method(:_init_process_group, original) - end - - def cuda_available? - Torch.const_defined?(:CUDA) && Torch::CUDA.respond_to?(:available?) && Torch::CUDA.available? 
- end -end - -class DistributedSpawnStartMethodTest < Minitest::Test - def test_spawn_worker_env_runs_block - reader, writer = IO.pipe - writer.close_on_exec = false - - pid = fork do - reader.close - ENV[Torch::Distributed::SPAWN_ENV_KEY] = "1" - ENV[Torch::Distributed::SPAWN_RANK_ENV_KEY] = "0" - ENV[Torch::Distributed::SPAWN_WORLD_SIZE_ENV_KEY] = "1" - ENV[Torch::Distributed::SPAWN_PORT_ENV_KEY] = "1234" - ENV[Torch::Distributed::SPAWN_PIPE_ENV_KEY] = writer.fileno.to_s - Torch::Distributed.fork_world(1, start_method: :spawn) { |rank, port| [rank, port] } - end - - writer.close - result = Marshal.load(reader) - reader.close - - _pid, status = Process.wait2(pid) - assert status.success? - assert_equal [0, 1234], result - end -end - -class DistributedBackendTest < Minitest::Test - BACKEND = nil - - def setup - super - skip "Distributed backend not available" unless Torch::Distributed.available? - skip "No backend configured for test" unless backend - skip_unless_backend_available! - end - - def backend - self.class::BACKEND - end - - def tensor_options - {} - end - - def skip_unless_backend_available! - skip "#{backend} backend not available" unless backend_available? - end - - def backend_available? - timeout = distributed_timeout - port = Torch::Distributed.free_port - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, 1, true, wait_for_workers: false, timeout: timeout) - Torch::Distributed.init_process_group(backend, store: store, rank: 0, world_size: 1, timeout: timeout) - true - rescue StandardError => e - return false if e.message =~ /not available/i || e.message =~ /unsupported backend/i - raise - ensure - Torch::Distributed.destroy_process_group if Torch::Distributed.initialized? 
- end - - def fork_with_backend(world_size: 2, start_method: :spawn) - timeout = distributed_timeout - original_filter = ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] - original_script = ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] - ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = name if start_method == :spawn - ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = File.expand_path(__FILE__) if start_method == :spawn - Timeout.timeout(timeout, Timeout::Error, "distributed test exceeded #{timeout}s") do - Torch::Distributed.fork_world(world_size, start_method: start_method) do |rank, port| - Timeout.timeout(timeout, Timeout::Error, "distributed worker #{rank} exceeded #{timeout}s") do - store = Torch::Distributed::TCPStore.new("127.0.0.1", port, world_size, rank.zero?, timeout: timeout) - Torch::Distributed.init_process_group( - backend, - store: store, - rank: rank, - world_size: world_size, - device_id: rank, - timeout: timeout - ) - begin - yield(rank) - ensure - Torch::Distributed.destroy_process_group - end - end - end - end - ensure - ENV[Torch::Distributed::SPAWN_TEST_ENV_KEY] = original_filter - ENV[Torch::Distributed::SPAWN_SCRIPT_ENV_KEY] = original_script - end - - def test_all_reduce - results = fork_with_backend do |rank| - tensor = Torch.tensor([rank + 1.0], **tensor_options) - Torch::Distributed.all_reduce(tensor) - tensor.to_a - end - - assert_equal [[3.0], [3.0]], results - end - - def test_barrier - wait_times = fork_with_backend do |rank| - sleep 0.3 if rank.zero? 
- before = Process.clock_gettime(Process::CLOCK_MONOTONIC) - Torch::Distributed.barrier - after = Process.clock_gettime(Process::CLOCK_MONOTONIC) - after - before - end - - assert_operator wait_times.first, :<, 0.1 - assert_operator wait_times.last, :>=, 0.25 - end - - def test_broadcast - tensors = fork_with_backend do |rank| - tensor = Torch.tensor([rank + 1.0], **tensor_options) - Torch::Distributed.broadcast(tensor, src: 0) - tensor.to_a - end - - assert_equal [[1.0], [1.0]], tensors - end - - def test_ddp_gradient_sync - # autograd cannot run safely with fork-based multiprocessing; always use spawn here - grads = fork_with_backend(start_method: :spawn) do |rank| - device = tensor_options[:device] - model = Torch::NN::Linear.new(1, 1, bias: false) - model = model.to(device) if device - ddp = Torch::NN::Parallel::DistributedDataParallel.new(model) - input = Torch.tensor([[rank + 1.0]], **tensor_options) - output = ddp.call(input) - loss = output.sum - loss.backward - - grad = model.parameters.first.grad - grad = grad.to("cpu") if device - grad.item - end - - grads.each do |grad| - assert_in_delta 1.5, grad, 1e-6 - end - end - - def distributed_timeout - Integer(ENV.fetch("TORCH_DISTRIBUTED_TEST_TIMEOUT", "30")) - end -end - -class DistributedGlooTest < DistributedBackendTest - BACKEND = "gloo" - - def fork_with_backend(world_size: 2, start_method: :fork) - super(world_size: world_size, start_method: start_method) - end -end - -class DistributedNcclTest < DistributedBackendTest - BACKEND = "nccl" - - def setup - skip "CUDA not available for NCCL backend" unless Torch.const_defined?(:CUDA) && Torch::CUDA.available? - skip "Need at least 2 CUDA devices for NCCL tests" unless Torch::CUDA.device_count >= 2 - super - end - - def tensor_options - {device: "cuda"} - end - - def fork_with_backend(world_size: 2, start_method: :spawn) - super(world_size: world_size, start_method: start_method) - end -end