
Commit f8a64c3

kundaMwiza authored and pytorchmergebot committed
Broadcast constants on vectorised stores in CppTile2DKernel (pytorch#140262)
Currently constants are not broadcasted on vectorised stores in `CppTile2DKernel`. This leads to errors like the following:

```shell
error: request for member 'store' in 'tmp1', which is of non-class type 'signed char'
   61 |     tmp1.store(tmp2 + static_cast<int64_t>(8L*x0_inner), static_cast<int64_t>(8));
      |          ^~~~~
```

This PR adds the required broadcasting.

Fixes #ISSUE_NUMBER

Pull Request resolved: pytorch#140262
Approved by: https://github.com/jgong5
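For context, here is a minimal sketch of the pattern the fix enables: the scalar constant is wrapped in an `at::vec::Vectorized`, which broadcasts it across all lanes, before the vectorised `store` is called. This is illustrative only, not the actual Inductor-generated kernel; the function name, output pointer, and lane count of 8 are assumptions made for the example.

```cpp
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

// Illustrative sketch (not real generated code): a scalar constant must be
// broadcast into a vector register before a vectorised store is possible.
void store_broadcast_sketch(int8_t* out_ptr, int8_t tmp1) {
    // Calling `tmp1.store(...)` directly is the ill-formed pattern from the
    // error above, since `tmp1` is a plain `signed char`, not a vector type.
    auto tmp2 = at::vec::Vectorized<int8_t>(tmp1);  // broadcast the constant to every lane
    tmp2.store(out_ptr, 8);                         // vectorised store of 8 elements
}
```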
1 parent e1e3bbc commit f8a64c3

File tree

2 files changed (+58, -0)

test/inductor/test_cpu_repro.py

Lines changed: 53 additions & 0 deletions
```diff
@@ -3394,6 +3394,59 @@ def f(a):
         x = torch.rand(4, 5)
         self.common(f, (x,))
 
+    def test_broadcast_scalar_cpp_tile_2d_kernel(self):
+        # Based on detectron2_maskrcnn backbone (conv2d -> max_pool2d)
+        s0 = 12
+        s1 = 21
+
+        data = torch.randn(
+            [1, 256, 8 * s0, 8 * s1],
+        )
+        weight_one = torch.randn([256, 256, 1, 1], requires_grad=True)
+        weight_two = torch.randn((256, 256, 3, 3), requires_grad=True)
+        bias_one = torch.randn([256], requires_grad=True)
+        bias_two = torch.randn([256], requires_grad=True)
+
+        @torch.compile
+        def fn(data, weight_one, weight_two, bias_one, bias_two):
+            conv_result_one = torch.ops.aten.convolution.default(
+                data,
+                weight_one,
+                bias_one,
+                [1, 1],
+                [1, 1],
+                [1, 1],
+                False,
+                [0, 0],
+                1,
+            )
+
+            conv_result_two = torch.ops.aten.convolution.default(
+                data,
+                weight_two,
+                bias_two,
+                [1, 1],
+                [1, 1],
+                [1, 1],
+                False,
+                [0, 0],
+                1,
+            )
+
+            max_pool_result = torch.nn.functional.max_pool2d(
+                conv_result_one,
+                [1, 1],
+                [2, 2],
+                [0, 0],
+                [1, 1],
+                False,
+            )
+            return conv_result_one, conv_result_two, max_pool_result
+
+        torch._dynamo.mark_dynamic(data, 2)
+        torch._dynamo.mark_dynamic(data, 3)
+        self.common(fn, (data, weight_one, weight_two, bias_one, bias_two))
+
     def test_to_channels_last_lowp_fp(self):
         def f(a):
             return a.to(memory_format=torch.channels_last)
```

torch/_inductor/codegen/cpp.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -3284,6 +3284,11 @@ def load(self, name: str, index: sympy.Expr):
 
     def store(self, name, index, value, mode=None):
         assert "buf" in name
+        assert isinstance(value, CppCSEVariable), value
+        if not value.is_vec:
+            # this happens when we store a scalar into a vectorized buffer like "fill"
+            value = self.broadcast(value)
+
         var = self.args.output(name)
 
         inner = self.inner_itervar()
```

0 commit comments
