fix cat with out args (#1053) (#1074)

zhuhaozhe · web-flow · commit 4381f9126bbb · 2022-08-25T12:16:05.000+09:00
diff --git a/intel_extension_for_pytorch/csrc/aten/cpu/TensorShape.cpp b/intel_extension_for_pytorch/csrc/aten/cpu/TensorShape.cpp
@@ -38,6 +38,42 @@
 namespace torch_ipex {
 namespace cpu {
 
+using namespace at;
+
+void resize_out(
+    const Tensor& out,
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    const TensorOptions& options) {
+  TORCH_CHECK(
+      options.dtype() == out.dtype(),
+      "Expected out tensor to have dtype ",
+      options.dtype(),
+      ", but got ",
+      out.dtype(),
+      " instead");
+  TORCH_CHECK(
+      options.device() == out.device(),
+      "Expected out tensor to have device ",
+      options.device(),
+      ", but got ",
+      out.device(),
+      " instead");
+  const bool resized = at::native::resize_output(out, sizes);
+  // Only restride if a resize occurred; otherwise we ignore the (advisory)
+  // strides from the meta function and directly use the output tensor's
+  // preexisting strides
+  if (resized) {
+    if (!strides.empty()) {
+      TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
+      at::native::as_strided_(out, sizes, strides);
+    } else if (options.memory_format_opt().has_value()) {
+      out.unsafeGetTensorImpl()->empty_tensor_restride(
+          *options.memory_format_opt());
+    }
+  }
+}
+
 DEFINE_DISPATCH(cat_contig_stub);
 
 inline void cat_check_no_zero_dim(
@@ -169,7 +205,11 @@ at::Tensor& cat_out_cpu(
             memory_format);
   }
 
-  result = at::empty(sizes, options);
+  if (result.defined()) {
+    resize_out(result, sizes, /*strides=*/{}, options);
+  } else {
+    result = at::empty(sizes, options);
+  }
   // Checks for overlaps between the inputs and the output tensor.
   if (is_out_defined && found_valid_tensor) {
     at::assert_no_internal_overlap(result);
diff --git a/tests/cpu/test_cpu_ops.py b/tests/cpu/test_cpu_ops.py
@@ -5,6 +5,7 @@
 import random
 import intel_extension_for_pytorch as ipex
 from common_utils import TestCase
+import itertools
 
 try:
     import torchvision
@@ -776,14 +777,84 @@ def test_index_select(self):
         self.assertEqual(y2, y, prec=0.01)
 
     def test_cat(self):
-        x = x = torch.randn(2, 3)
-        y = torch.cat((x, x, x), 0)
-
-        # test bfloat16
-        x2 = x.clone().detach().bfloat16()
-        y2 = torch.cat((x2, x2, x2), 0)
-        self.assertTrue(y2.dtype == torch.bfloat16)
-        self.assertEqual(y2, y, prec=0.01)
+        for datatype in [torch.float32, torch.double, torch.bfloat16]:
+            for dim, size in itertools.product([0, 1], [[2, 1], [2, 2], [5, 10]]):
+                x = torch.randn(size, dtype=datatype)
+                y = torch.cat([x, x], dim)
+                self.assertTrue(y.dtype == datatype)
+
+            # long input tensor list
+            x1 = torch.randn((2, 2), dtype=datatype)
+            input1 = []
+            for i in range(100):
+                input1.append(x1)
+            y1 = torch.cat(input1, 0)
+            self.assertTrue(y1.size() == torch.Size([200, 2]))
+            self.assertTrue(y1.dtype == datatype)
+
+            # input tensors have different shapes and strides
+            x2 = torch.randn((400, 2), dtype=datatype)
+            input2 = []
+            for i in range(10):
+                input2.append(x1)
+            for i in range(100):
+                input2.append(x2)
+            y2 = torch.cat(input2, 0)
+            self.assertTrue(y2.size() == torch.Size([40020, 2]))
+            self.assertTrue(y2.dtype == datatype)
+
+            x3 = torch.randn((4000, 2), dtype=datatype)
+            input3 = []
+            for i in range(10):
+                input3.append(x1)
+            for i in range(10):
+                input3.append(x3)
+            y3 = torch.cat(input3, 0)
+            self.assertTrue(y3.size() == torch.Size([40020, 2]))
+            self.assertTrue(y3.dtype == datatype)
+
+            x4 = torch.randn((4, 2), dtype=datatype)
+            input4 = []
+            for i in range(10):
+                input4.append(x1)
+            for i in range(10):
+                input4.append(x4)
+            y4 = torch.cat(input4, 0)
+            self.assertTrue(y4.size() == torch.Size([60, 2]))
+            self.assertTrue(y4.dtype == datatype)
+
+            # "out" arg is given as an empty (zero-element) tensor
+            y5 = torch.cat([x4, x4], 0, out=torch.empty(0, dtype=datatype))
+            self.assertEqual(y5, torch.cat([x4, x4], 0))
+            self.assertTrue(y5.dtype == datatype)
+
+            # out is defined with wrong shape
+            ref = torch.cat([x4, x4], 0)
+            out = torch.zeros(1)
+            out_ptr = out.data_ptr()
+            torch.cat([x4, x4], 0, out=out)
+            self.assertEqual(ref, out)
+            self.assertTrue(ref.dtype == datatype)
+            self.assertTrue(out_ptr != out.data_ptr())
+
+            # out is defined with correct shape
+            ref = torch.cat([x4, x4], 0)
+            out = torch.zeros_like(ref)
+            out_ptr = out.data_ptr()
+            torch.cat([x4, x4], 0, out=out)
+            self.assertEqual(ref, out)
+            self.assertTrue(ref.dtype == datatype)
+            self.assertTrue(out_ptr == out.data_ptr())
+
+            y6 = torch.cat([x4, x4], 0, out=torch.empty(0, dtype=torch.float32))
+            self.assertEqual(y6, torch.cat([x4, x4], 0))
+            self.assertTrue(y6.dtype == torch.float32)
+
+            # one of the input tensors is empty
+            x7 = torch.empty(0, dtype=datatype)
+            y7 = torch.cat([x4, x4, x7], 0)
+            self.assertTrue(y7.size() == torch.Size([8, 2]))
+            self.assertTrue(y7.dtype == datatype)
 
 if __name__ == '__main__':
     test = unittest.main()