diff --git a/3rdparty/cutlass b/3rdparty/cutlass index c2ee13a0fe99..8a766804ad6f 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c +Subproject commit 8a766804ad6ff14b1164fe922c6fe54c131bb02b diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 09511cf9fe5f..21cc7de0dc9f 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 09511cf9fe5ff103900a5eafb50870dc84cc17c8 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/3rdparty/rang b/3rdparty/rang index cabe04d6d6b0..22345aa4c468 160000 --- a/3rdparty/rang +++ b/3rdparty/rang @@ -1 +1 @@ -Subproject commit cabe04d6d6b05356fa8f9741704924788f0dd762 +Subproject commit 22345aa4c468db3bd4a0e64a47722aad3518cc81 diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index a88c1305b560..a7d56672e0ce 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy): "max_innermost_split_factor": 64, "max_vectorize_size": 16, "disable_change_compute_location": 0, + "ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"], } def __init__( diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 4a4ab18b5eed..afcab3939b07 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->verbose = verbose; node->sample_init_min_pop_ = GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population); + auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names); if (init_search_callbacks) { PrintTitle("Call init-search callbacks", verbose); @@ -153,6 +154,49 @@ 
SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, LOG(FATAL) << "No default sketch rules for target: " << task->target; } + // ablating specified rules to measure their impacts + int vp = 0; + for (auto* rule : node->sketch_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating sketch rule: " << rule->GetRuleName() << std::endl; + } else { + node->sketch_rules[vp++] = rule; + StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->sketch_rules.size()) { + node->sketch_rules.erase(node->sketch_rules.begin() + vp, node->sketch_rules.end()); + StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size(); + } + + vp = 0; + for (auto* rule : node->init_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating init rule: " << rule->GetRuleName() << std::endl; + } else { + node->init_rules[vp++] = rule; + StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->init_rules.size()) { + node->init_rules.erase(node->init_rules.begin() + vp, node->init_rules.end()); + StdCout(verbose) << "Init rule size: " << node->init_rules.size(); + } + + vp = 0; + for (auto rule : node->mutation_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating mutation rule: " << rule->GetRuleName() << std::endl; + } else { + node->mutation_rules[vp++] = rule; + StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->mutation_rules.size()) { + node->mutation_rules.erase(node->mutation_rules.begin() + vp, node->mutation_rules.end()); + StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size(); + } + + data_ = std::move(node); } diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index faf058b45b19..2016f04c350b 100644 --- 
a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -85,6 +85,8 @@ struct SketchParamKey { static constexpr const char* max_vectorize_size = "max_vectorize_size"; /*! \brief Whether disable compute location changing. */ static constexpr const char* disable_change_compute_location = "disable_change_compute_location"; + /*! \brief The list of rules to be ablated */ + static constexpr const char* ablated_rule_names = "ablated_rule_names"; }; class SketchPolicy; diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h index fc1916b8c67d..4d8d6f3e6514 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.h +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h @@ -171,6 +171,11 @@ class PopulationGenerationRule { */ virtual ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const = 0; + /*! + * \brief Get the name of this rule. + * \return A string of the rule name. + */ + virtual std::string GetRuleName() const = 0; /*! \brief The deconstructor */ virtual ~PopulationGenerationRule() = default; @@ -181,6 +186,7 @@ class PopulationGenerationRule { class rule_name : public PopulationGenerationRule { \ public: \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! \brief The rule that fills the incomplete SplitSteps. */ @@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule { public: \ explicit rule_name(double weight) : PopulationMutationRule(weight) {} \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! 
\brief The rule that mutates tile size by randomly dividing a tile size by a factor diff --git a/workspace/ablate_sketch_rule/ablate.sh b/workspace/ablate_sketch_rule/ablate.sh new file mode 100755 index 000000000000..118d7bb75b3e --- /dev/null +++ b/workspace/ablate_sketch_rule/ablate.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize" + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +#echo "Default Tuning with batch_size=64 at: "$(eval $cur_time) +#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1 + +echo "Begin ablating rules with bs=16 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs16-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 16 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done + +echo "Begin ablating rules with bs=64 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs64-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done diff --git a/workspace/ablate_sketch_rule/apply_tuned.py b/workspace/ablate_sketch_rule/apply_tuned.py new file mode 100644 index 000000000000..ab2c45f4c7c6 --- /dev/null +++ b/workspace/ablate_sketch_rule/apply_tuned.py @@ -0,0 +1,200 @@ 
+# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. 
In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +# Get network +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +def apply_tuned(log_file): + print("Apply: %s" % log_file) + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + apply_tuned(log_file) diff --git a/workspace/ablate_sketch_rule/evaluate.sh b/workspace/ablate_sketch_rule/evaluate.sh new file mode 100755 index 000000000000..a53ac360f930 --- /dev/null +++ b/workspace/ablate_sketch_rule/evaluate.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Begin evaluating on: "$(eval $cur_time) +#python -u no_schedule.py -b 16 -d 7 +#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64 +log_file=./log/bs64-default.debug +python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json 
> $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > $log_file 2>&1 & +echo "End at: $(eval $cur_time)" diff --git a/workspace/ablate_sketch_rule/print_best.py b/workspace/ablate_sketch_rule/print_best.py new file mode 100644 index 000000000000..f1ac8dd6560c --- /dev/null +++ b/workspace/ablate_sketch_rule/print_best.py @@ -0,0 +1,255 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. 
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = 
relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +dev = tvm.device(str(target), args.device_id) + +def debug_tuned_result(log_file): + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print("Weight:%f" % task_weights[idx]) + compute_dag = task.compute_dag + print("DAG------->") + print(compute_dag) + #sch, args = task.apply_best(log_file) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key) + if inp is None: + print("!!!Can't find tuned schedule, skip") + continue + #sch, tensors = compute_dag.apply_steps_from_state(compute_dag.get_init_state()) + else: + sch, tensors = compute_dag.apply_steps_from_state(inp.state) + lowered_module = tvm.lower(sch, tensors, simple_mode=True) + print("TIR------->") + print(lowered_module) + print("TIR AST------->") + print(lowered_module.astext()) + func = tvm.build(sch, tensors, target) + print("CUDA------->") + #print(task.print_best(log_file, print_mode="cuda")) + print(func.imported_modules[0].get_source()) + input_data = [] + for tensor in tensors: + shape = auto_scheduler.utils.get_const_tuple(tensor.shape) + xd = tvm.nd.array((np.random.uniform(size=shape)).astype(dtype), device=dev) + input_data.append(xd) + evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) + print("Execution time of this task: %.3f ms" % (np.median(evaluator(*input_data).results) * 1000)) + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. 
+ +# Compile with the history best +def apply_tuned(log_file): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +if os.path.isdir(args.tuned_dir): + for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + #apply_tuned(log_file) +else: + log_file = args.tuned_dir + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + diff --git a/workspace/ablate_sketch_rule/tune_network_cuda.py b/workspace/ablate_sketch_rule/tune_network_cuda.py new file mode 100644 index 000000000000..e7074a51739e --- /dev/null +++ b/workspace/ablate_sketch_rule/tune_network_cuda.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 


def get_network(name, batch_size, layout="NHWC", dtype="float32"):
    """Get the symbol definition and random weight of a network.

    Parameters
    ----------
    name : str
        Network identifier, e.g. "resnet-50", "resnet3d-18", "mobilenet",
        "squeezenet_v1.1", "inception_v3", or "mxnet".
    batch_size : int
        Batch dimension of the input tensor.
    layout : str
        Either "NHWC" (preferred by the auto-scheduler) or "NCHW".
    dtype : str
        Data type of the network, e.g. "float32".

    Returns
    -------
    tuple
        (mod, params, input_shape, output_shape).

    Raises
    ------
    ValueError
        If the layout or the network name is not recognized.
    """
    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        # BUG FIX: this branch previously duplicated the 2-D resnet branch
        # verbatim; a "resnet3d-N" name should build the 3-D resnet workload.
        # NOTE(review): a 3-D resnet presumably expects an image_shape with a
        # temporal/depth dimension — confirm against relay.testing.resnet_3d.
        mod, params = relay.testing.resnet_3d.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"

        block = get_model("resnet18_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    else:
        # BUG FIX: unknown names previously fell through and raised a
        # confusing NameError on the unbound `mod`; fail fast instead.
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape


# Define the neural network and compilation target
network = "resnet-50"
batch_size = args.batch_size
layout = "NHWC"
target = tvm.target.Target("cuda")
dtype = "float32"
log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules))
log_file = os.path.join(args.tuned_dir, log_name)
# Ensure the output directory exists; otherwise RecordToFile fails later
# with a hard-to-trace error when it first tries to write the log.
os.makedirs(args.tuned_dir, exist_ok=True)

#################################################################
# Extract Search Tasks
# --------------------
# Next, we extract the search tasks and their weights from a network.
# The weight of a task is the number of appearances of the task's subgraph
# in the whole network.
# By using the weight, we can approximate the end-to-end latency of the network
# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the
# latency of a task and :code:`weight[t]` is the weight of the task.
# The task scheduler will just optimize this objective.

# Extract tasks from the network
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)

#################################################################
# Begin Tuning
# ------------
# Now, we set some options for tuning and launch the search tasks
#
# * :code:`measure_ctx` launches a different process for measurement to
#   provide isolation. It can protect the main process from GPU crashes
#   during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
#   This can warmup the GPU, which is necessary to get accurate measurement results.
#   Typically, we recommend a value >= 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
#   You can set it to a small number (e.g., 200) for a fast demonstrative run.
#   In practice, we recommend setting it around :code:`900 * len(tasks)`,
#   which is typically enough for the search to converge.
#   For example, there are 24 tasks in resnet-18, so we can set it as 20000.
#   You can adjust this parameter according to your time budget.
# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file,
#   The measurement records can be used to query the history best, resume the search,
#   and do more analyses later.
# * see :any:`auto_scheduler.TuningOptions`,
#   :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters.
#


def run_tuning():
    """Run the auto-scheduler search over all extracted tasks.

    Measurement records are appended to ``log_file``; the ablated rule
    names from the command line are forwarded to the search policy.
    """
    print("Begin tuning...")
    # A separate RPC measurement process isolates GPU crashes from us.
    ctx = auto_scheduler.LocalRPCMeasureContext(
        n_parallel=2,
        repeat=1,
        min_repeat_ms=300,
        timeout=10,
        device=args.device_id,
    )

    options = auto_scheduler.TuningOptions(
        num_measure_trials=args.num_measure_trials,  # change this to 20000 to achieve the best performance
        runner=ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    scheduler.tune(options, search_policy_params={'ablated_rule_names': args.ablated_rules})


# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

run_tuning()


######################################################################
# .. note:: Explain the printed information during tuning
#
#   During the tuning, a lot of information will be printed on the console.
#   They are used for debugging purposes. The most important info is the output
#   of the task scheduler. The following table is a sample output.
#
# ..
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the secion below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`.