
Commit 52b3c1a

Add private API to set blas backend (#1050)
* Add private API to set blas backend
* Fix UT

1 parent: 7076524
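In short, this commit introduces a process-global switch for the BLAS backend used when prepacking fp32 torch.nn.Linear modules: a BlasBackend class in _weight_prepack.py (default "dnnl"), plus private wrappers _set_blas_backend, _is_mkl_blas_backend, and _is_dnnl_blas_backend exported from the frontend. The practical effect is that the MKL SGEMM prepack path, previously chosen automatically for fp32 inference under FP32 math mode, becomes opt-in; oneDNN is the default.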


5 files changed: +141, -39 lines


intel_extension_for_pytorch/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -29,4 +29,4 @@
 from . import autocast
 
 from .utils.verbose import verbose
-from .frontend import optimize, enable_onednn_fusion, set_fp32_math_mode, get_fp32_math_mode, FP32MathMode
+from .frontend import optimize, enable_onednn_fusion, set_fp32_math_mode, get_fp32_math_mode, FP32MathMode, _set_blas_backend, _is_mkl_blas_backend, _is_dnnl_blas_backend

intel_extension_for_pytorch/frontend.py

Lines changed: 12 additions & 3 deletions
@@ -159,7 +159,7 @@ def optimize(
             input data will impact the block format of packed weight. If not feed a sample
             input, Intel® Extension for PyTorch* will pack the weight per some predefined heuristics.
             If feed a sample input with real input shape, Intel® Extension for PyTorch* can get
-            best block format.
+            best block format.
         auto_kernel_selection (bool) [experimental]: Different backends may have
             different performances with different dtypes/shapes. Default value
             is False. Intel® Extension for PyTorch* will try to optimize the
@@ -241,7 +241,7 @@ def optimize(
     if fuse_update_step is not None:
         opt_properties.fuse_update_step = fuse_update_step
     if auto_kernel_selection is not None:
-        opt_properties.auto_kernel_selection = auto_kernel_selection
+        opt_properties.auto_kernel_selection = auto_kernel_selection
 
     if inplace:
         optimized_model = model
@@ -253,7 +253,7 @@ def optimize(
         if isinstance(sample_input, torch.Tensor):
             sample_input = (sample_input,)
         utils._weight_prepack.record_input_shape_for_prepack(optimized_model, sample_input)
-
+
     if not model.training:
         if opt_properties.conv_bn_folding:
             try:
@@ -384,3 +384,12 @@ def get_fp32_math_mode(device="cpu"):
     """
 
     return core.get_fp32_math_mode()
+
+def _set_blas_backend(backend="dnnl"):
+    utils._weight_prepack.BlasBackend.set_backend(backend)
+
+def _is_mkl_blas_backend():
+    return utils._weight_prepack.BlasBackend.is_mkl()
+
+def _is_dnnl_blas_backend():
+    return utils._weight_prepack.BlasBackend.is_dnnl()
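For orientation, a minimal usage sketch of the new private API, mirroring how the unit tests below exercise it (the underscore prefix marks these functions as private and subject to change):

import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(3, 32).eval()

ipex._set_blas_backend("mkl")    # route fp32 Linear prepack through MKL SGEMM
assert ipex._is_mkl_blas_backend()
opt_model = ipex.optimize(model, dtype=torch.float32, auto_kernel_selection=True)

ipex._set_blas_backend("dnnl")   # restore the default oneDNN backend
assert ipex._is_dnnl_blas_backend()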

intel_extension_for_pytorch/nn/utils/_weight_prepack.py

Lines changed: 16 additions & 1 deletion
@@ -8,6 +8,21 @@
 
 logger = logging.getLogger(__name__)
 
+class BlasBackend:
+    _blas_backend = "dnnl"
+
+    @classmethod
+    def set_backend(cls, backend="dnnl"):
+        cls._blas_backend = backend
+
+    @classmethod
+    def is_mkl(cls):
+        return cls._blas_backend == "mkl"
+
+    @classmethod
+    def is_dnnl(cls):
+        return cls._blas_backend == "dnnl"
+
 class _IPEXConvNd(nn.Module):
     __constants__ = ['stride', 'padding', 'dilation', 'groups',
                      'out_channels', 'kernel_size']
@@ -302,7 +317,7 @@ def convert(m, optimizer, params_attr, auto_kernel_selection):
         if weight not in params_attr:
             params_attr[weight] = {}
         if type(m) is torch.nn.Linear:
-            if m.weight.dtype == torch.float32 and optimizer is None and frontend.get_fp32_math_mode(device="cpu") == frontend.FP32MathMode.FP32:
+            if BlasBackend.is_mkl() and m.weight.dtype == torch.float32 and optimizer is None and frontend.get_fp32_math_mode(device="cpu") == frontend.FP32MathMode.FP32:
                 new_m = IPEX_WEIGHT_PREPACK_MODULE[type(m)](m, use_dnnl = False)
             else:
                 new_m = IPEX_WEIGHT_PREPACK_MODULE[type(m)](m, use_dnnl = True)
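Note that BlasBackend stores a single class-level string, so the setting is process-wide rather than per-model or per-thread, and it is consulted at ipex.optimize() time. A quick round-trip sketch using the import path from the diff above (normal callers would go through the ipex._set_blas_backend wrapper instead):

from intel_extension_for_pytorch.nn.utils._weight_prepack import BlasBackend

BlasBackend.set_backend("mkl")
assert BlasBackend.is_mkl() and not BlasBackend.is_dnnl()
BlasBackend.set_backend("dnnl")  # restore the default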

tests/cpu/test_jit.py

Lines changed: 63 additions & 26 deletions
@@ -1078,7 +1078,7 @@ def _test_output(self, base_model, x, kind_in_graph=None, kind_not_in_graph=None
         if kind_not_in_graph is not None:
             self.assertTrue(all(n.kind() != kind_not_in_graph for n in trace_graph.nodes()))
 
-    def _test_mkl_fp32(self, model, input, kind_in_graph=None, prec=5e-3):
+    def _test_blas_backend_fp32(self, model, input, kind_in_graph=None, prec=5e-3):
         model = model.eval()
         model = ipex.optimize(model, dtype=torch.float32, auto_kernel_selection=True)
         with torch.no_grad():
@@ -1210,19 +1210,23 @@ def check_op_count(graph_str, op_names=[]):
         linear_count_ori = check_op_count(graph_opt, ["aten::linear"])
         self.assertEqual(linear_count_ori, 2)
         #call prepack mkl path(fp32)
+        ipex._set_blas_backend("mkl")
         model = ipex.optimize(origin_model, dtype=torch.float32, auto_kernel_selection=True)
         ori_res = model(test_val1)
         with torch.no_grad():
             model_jit = torch.jit.trace(model,(test_val1))
             graph_ori = str(model_jit.graph_for(test_val1))
             linear_count_ori = check_op_count(graph_ori, ["torch_ipex::ipex_MKLSGEMM"])
             self.assertEqual(linear_count_ori, 4)
+
             model_jit = torch.jit.freeze(model_jit)
             jit_res = model_jit(test_val1)
             self.assertEqual(ori_res, jit_res)
+
             graph_opt = str(model_jit.graph_for(test_val1))
             linear_count_ori = check_op_count(graph_opt, ["ipex_prepack::mkl_sgemm_run"])
             self.assertEqual(linear_count_ori, 2)
+        ipex._set_blas_backend("dnnl")
 
         model = ipex.optimize(origin_model, dtype=torch.bfloat16)
         test_val1 = test_val1.bfloat16()
@@ -1286,7 +1290,7 @@ def test_add_layernorm(self):
         c = torch.randn(bs, seq_len, dim)
         jit_model = torch.jit.trace(model,(a, b, c))
         trace_graph = jit_model.graph_for(a, b, c)
-
+
         jit_res = jit_model(a, b, c)
         ori_res = model(a, b, c)
         self.assertEqual(jit_res, ori_res)
@@ -1495,7 +1499,7 @@ def _test_pure_bf16_parts(model, trace_model, qk, mask, prec=3e-2):
             res_jit = trace_model(qk_bf16, mask_bf16)
             self.assertEqual(res_ref, res_jit, prec=prec)
             _check_match_mha_parts(trace_model, qk_bf16, mask)
-
+
         for sequance_length in [128, 100]:
             mat1 = torch.randn(56, 12, sequance_length, sequance_length)
             mat2 = torch.randn(56, 12, sequance_length, sequance_length)
@@ -2618,8 +2622,9 @@ def test_conv_transpose_sigmoid_mul(self):
 
     def test_linear_auto_kernel_selection_fp32(self):
         x = torch.rand(32, 3)
-        options = itertools.product(['O0', 'O1'], [True, False])
-        for level, auto_select_kernel in options:
+        options = itertools.product(['O0', 'O1'], [True, False], ["mkl", "dnnl"])
+        for level, auto_select_kernel, blas_backend in options:
+            ipex._set_blas_backend(blas_backend)
             model = LinearRelu(3, 32, bias=True).eval()
             model = ipex.optimize(model, dtype=torch.float32, level=level, auto_kernel_selection=auto_select_kernel)
             with torch.no_grad():
@@ -2629,10 +2634,11 @@ def test_linear_auto_kernel_selection_fp32(self):
             trace_graph = traced_model.graph_for(x)
 
             if auto_select_kernel and level == 'O1':
-                # for auto_select_kernel is True and level is O1, we will use ipex prepacked MKL linear
-                self.assertTrue(any(n.kind() == 'ipex_prepack::mkl_sgemm_run' for n in trace_graph.nodes()))
+                if ipex._is_mkl_blas_backend():
+                    self.assertTrue(any(n.kind() == 'ipex_prepack::mkl_sgemm_run' for n in trace_graph.nodes()))
+                else:
+                    self.assertTrue(any(n.kind() == 'ipex_prepack::linear_relu_run' for n in trace_graph.nodes()))
             else:
-                # auto_select_kernel is false, we will use mkl linear
                 self.assertTrue(any(n.kind() == 'aten::linear' for n in trace_graph.nodes()))
 
     def test_linear_auto_kernel_selection_bf16(self):
@@ -2788,10 +2794,22 @@ def _test_linear_unary_fusion(self, op_list, seed=None):
                 m,
                 x,
                 kind_in_graph="aten::linear")
-            self._test_mkl_fp32(
+
+            blas_backend = {"mkl":"ipex_prepack::mkl_sgemm_run"}
+            for _blas in blas_backend.keys():
+                ipex._set_blas_backend(_blas)
+                self._test_blas_backend_fp32(
+                    m,
+                    x,
+                    kind_in_graph=blas_backend[_blas])
+
+            ipex._set_blas_backend("dnnl")
+            self._test_blas_backend_fp32(
                 m,
                 x,
-                kind_in_graph="ipex_prepack::mkl_sgemm_run")
+                kind_in_graph="ipex_prepack::linear_%s_run" % ipex_eltwise_op,
+                prec=prec)
+
             if bf16_supported:
                 self._test_output_bf16(
                     m,
@@ -2836,10 +2854,16 @@ def test_output_linear_add(self):
             LinearAdd(3, 32, bias=True),
             torch.rand(32, 3),
             kind_in_graph="aten::linear")
-        self._test_mkl_fp32(
-            LinearAdd(3, 32, bias=True),
-            torch.rand(32, 3),
-            kind_in_graph="ipex_prepack::mkl_sgemm_run")
+
+        blas_backend = {"mkl":"ipex_prepack::mkl_sgemm_run", "dnnl":"ipex_prepack::linear_run"}
+        for _blas in blas_backend.keys():
+            ipex._set_blas_backend(_blas)
+            self._test_blas_backend_fp32(
+                LinearAdd(3, 32, bias=True),
+                torch.rand(32, 3),
+                kind_in_graph=blas_backend[_blas])
+        ipex._set_blas_backend("dnnl")
+
         self._test_output_bf16(
             LinearAdd(3, 32, bias=True),
             torch.rand(32, 3),
@@ -2855,10 +2879,16 @@ def test_output_linear_add_relu(self):
             m,
             x,
             kind_in_graph="aten::linear")
-        self._test_mkl_fp32(
-            m,
-            x,
-            kind_in_graph="ipex_prepack::mkl_sgemm_run")
+
+        blas_backend = {"mkl":"ipex_prepack::mkl_sgemm_run", "dnnl":"ipex_prepack::linear_run"}
+        for _blas in blas_backend.keys():
+            ipex._set_blas_backend(_blas)
+            self._test_blas_backend_fp32(
+                m,
+                x,
+                kind_in_graph=blas_backend[_blas])
+        ipex._set_blas_backend("dnnl")
+
         self._test_output_bf16(
             m,
             x,
@@ -2885,14 +2915,21 @@ def test_output_linear_reshape_bn(self):
             kind_in_graph="aten::linear")
 
     def test_output_linear_swish(self):
-        self._test_mkl_fp32(
-            LinearSigmoidMul(3, 32, bias=True),
-            torch.rand(32, 3),
-            kind_in_graph="ipex_prepack::mkl_sgemm_run")
-        self._test_mkl_fp32(
-            LinearSigmoidMul(3, 32, bias=False),
-            torch.rand(32, 3),
-            kind_in_graph="ipex_prepack::mkl_sgemm_run")
+
+        blas_backend = {"mkl":"ipex_prepack::mkl_sgemm_run", "dnnl":"ipex_prepack::linear_swish_run"}
+        for _blas in blas_backend.keys():
+            ipex._set_blas_backend(_blas)
+
+            self._test_blas_backend_fp32(
+                LinearSigmoidMul(3, 32, bias=True),
+                torch.rand(32, 3),
+                kind_in_graph=blas_backend[_blas])
+            self._test_blas_backend_fp32(
+                LinearSigmoidMul(3, 32, bias=False),
+                torch.rand(32, 3),
+                kind_in_graph=blas_backend[_blas])
+        ipex._set_blas_backend("dnnl")
+
         self._test_output_bf16(
             LinearSigmoidMul(3, 32, bias=True),
             torch.rand(32, 3),
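The tests above repeat a set-then-restore pattern around every "mkl" run, and a failing assertion in between would leave the global backend set to "mkl" for subsequent tests. A hypothetical context-manager helper (not part of this commit) would make the restore unconditional:

import contextlib
import intel_extension_for_pytorch as ipex

@contextlib.contextmanager
def blas_backend(backend):
    # Hypothetical helper, not in this commit: switch the global BLAS
    # backend for the duration of the block, then restore the default.
    ipex._set_blas_backend(backend)
    try:
        yield
    finally:
        ipex._set_blas_backend("dnnl")

With it, each MKL block would read: with blas_backend("mkl"): self._test_blas_backend_fp32(...).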

tests/cpu/test_weight_prepack.py

Lines changed: 49 additions & 8 deletions
@@ -182,7 +182,7 @@ def _test_convolution_training_base(self, dim, dtype, rtol=None, atol=None):
 
     def test_conv2d_training(self):
         self._test_convolution_training_base(dim=2, dtype=torch.float)
-        if core.onednn_has_bf16_support():
+        if core.onednn_has_bf16_support():
             self._test_convolution_training_base(dim=2, dtype=torch.bfloat16, rtol=1e-2, atol=1e-03)
 
     # TODO: add inference case.
@@ -436,6 +436,47 @@ def test_resnext50_32x4d(self):
         model = torchvision.models.resnet.resnext50_32x4d(pretrained=False)
         self._test_imagenet_model(model)
 
+    def test_blas_backend(self):
+        class L(torch.nn.Module):
+            def __init__(self, in_f, out_f, bias):
+                super(L, self).__init__()
+                self.linear = torch.nn.Linear(in_f, out_f, bias=bias)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        out_features = torch.randint(3, 10, (1,)).item()
+        in_features = torch.randint(3, 10, (1,)).item()
+
+        input_shape = (8, in_features)
+        x = torch.randn(input_shape, dtype=torch.float32)
+        model = L(in_features, out_features, True)
+        origin_model = copy.deepcopy(model).eval()
+
+        def test_dnnl():
+            self.assertTrue(ipex._is_dnnl_blas_backend())
+            ipex_model_dnnl = ipex.optimize(origin_model, dtype=torch.float32, level='O1', auto_kernel_selection=True)
+            with torch.no_grad():
+                dnnl_graph = torch.jit.trace(ipex_model_dnnl.eval(), x)
+                dnnl_graph = torch.jit.freeze(dnnl_graph)
+                dnnl_graph(x)
+                trace_graph = dnnl_graph.graph_for(x)
+                self.assertTrue(any(n.kind() == "ipex_prepack::linear_run" for n in trace_graph.nodes()))
+        test_dnnl()
+
+        ipex._set_blas_backend("mkl")
+        self.assertTrue(ipex._is_mkl_blas_backend())
+        ipex_model_dnnl = ipex.optimize(origin_model, dtype=torch.float32, level='O1', auto_kernel_selection=True)
+        with torch.no_grad():
+            dnnl_graph = torch.jit.trace(ipex_model_dnnl.eval(), x)
+            dnnl_graph = torch.jit.freeze(dnnl_graph)
+            dnnl_graph(x)
+            trace_graph = dnnl_graph.graph_for(x)
+            self.assertTrue(any(n.kind() == "ipex_prepack::mkl_sgemm_run" for n in trace_graph.nodes()))
+
+        ipex._set_blas_backend("dnnl")
+        test_dnnl()
+
     def test_linear_inference(self):
         class L(torch.nn.Module):
             def __init__(self, in_f, out_f, bias):
@@ -479,7 +520,7 @@ def test_linear_training(self):
         input_shapes = []
         for s in in_feature:
             input_shapes += [(128, s), (2, 64, s), (2, 2, 32, s)]
-
+
         options = itertools.product(out_feature, [True, False], input_shapes, [torch.bfloat16], [True, False])
         for out_features, bias, x_shape, dtype, feed_sample_input in options:
             in_features = x_shape[-1]
@@ -564,12 +605,12 @@ def _deconv_with_output_padding(self):
             "groups": 1,
             "dilation": 3,
         }
-
+
         params_list = []
 
         for key, value in params_dict.items():
             params_list.append(value)
-        return params_list
+        return params_list
 
     # mkldnn does not support the case where:
     # padding - output_padding + stride <= 0
@@ -594,7 +635,7 @@ def _deconv_fallback_shape(self):
 
         for key, value in params_dict.items():
             params_list.append(value)
-        return params_list
+        return params_list
 
     def _test_deconv(self, dims, inference):
         class Deconv2d(torch.nn.Module):
@@ -667,14 +708,14 @@ def forward(self, x):
                 ipex_model, ipex_optimizer = ipex.optimize(origin_model, dtype=dtype, optimizer=origin_optimizer, level='O1', sample_input=x)
             else:
                 ipex_model, ipex_optimizer = ipex.optimize(origin_model, dtype=dtype, optimizer=origin_optimizer, level='O1')
-
+
             if padding - output_padding + stride <= 0:
                 # unsupported in mkldnn, should not replace the original ConvTranspose module
                 self.assertTrue(module_found(ipex_model, torch.nn.ConvTranspose2d if dims == 2 else torch.nn.ConvTranspose3d))
                 continue
             else:
-                self.assertFalse(module_found(ipex_model, torch.nn.ConvTranspose2d if dims == 2 else torch.nn.ConvTranspose3d))
-
+                self.assertFalse(module_found(ipex_model, torch.nn.ConvTranspose2d if dims == 2 else torch.nn.ConvTranspose3d))
+
             x1 = x.clone().requires_grad_()
             x2 = x.clone().requires_grad_()