From 8c3b97dfe3b1662472616b2168659759eee15cc1 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Tue, 14 Jun 2022 06:14:09 +0000 Subject: [PATCH 1/4] ablation study draft code --- python/tvm/auto_scheduler/search_policy.py | 1 + .../search_policy/sketch_policy.cc | 44 +++ .../search_policy/sketch_policy.h | 2 + .../search_policy/sketch_policy_rules.h | 7 + workspace/ablation_study_on_rule/ablate.sh | 17 + .../ablation_study_on_rule/apply_tuned.py | 200 ++++++++++ workspace/ablation_study_on_rule/evaluate.sh | 5 + .../tune_network_cuda.py | 355 ++++++++++++++++++ 8 files changed, 631 insertions(+) create mode 100755 workspace/ablation_study_on_rule/ablate.sh create mode 100644 workspace/ablation_study_on_rule/apply_tuned.py create mode 100755 workspace/ablation_study_on_rule/evaluate.sh create mode 100644 workspace/ablation_study_on_rule/tune_network_cuda.py diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index a88c1305b560..a7d56672e0ce 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy): "max_innermost_split_factor": 64, "max_vectorize_size": 16, "disable_change_compute_location": 0, + "ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"], } def __init__( diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 4a4ab18b5eed..afcab3939b07 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->verbose = verbose; node->sample_init_min_pop_ = GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population); + auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names); if (init_search_callbacks) { PrintTitle("Call init-search 
callbacks", verbose); @@ -153,6 +154,49 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, LOG(FATAL) << "No default sketch rules for target: " << task->target; } + // ablating specified rules to measure their impacts + int vp = 0; + for (auto* rule : node->sketch_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating sketch rule: " << rule->GetRuleName() << std::endl; + } else { + node->sketch_rules[vp++] = rule; + StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->sketch_rules.size()) { + node->sketch_rules.erase(node->sketch_rules.begin() + vp, node->sketch_rules.end()); + StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size(); + } + + vp = 0; + for (auto* rule : node->init_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating init rule: " << rule->GetRuleName() << std::endl; + } else { + node->init_rules[vp++] = rule; + StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->init_rules.size()) { + node->init_rules.erase(node->init_rules.begin() + vp, node->init_rules.end()); + StdCout(verbose) << "Init rule size: " << node->init_rules.size(); + } + + vp = 0; + for (auto rule : node->mutation_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating mutation rule: " << rule->GetRuleName() << std::endl; + } else { + node->mutation_rules[vp++] = rule; + StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->mutation_rules.size()) { + node->mutation_rules.erase(node->mutation_rules.begin() + vp, node->mutation_rules.end()); + StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size(); + } + data_ = std::move(node); } diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index 
faf058b45b19..2016f04c350b 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -85,6 +85,8 @@ struct SketchParamKey { static constexpr const char* max_vectorize_size = "max_vectorize_size"; /*! \brief Whether disable compute location changing. */ static constexpr const char* disable_change_compute_location = "disable_change_compute_location"; + /*! \brief The list of rules to be ablated */ + static constexpr const char* ablated_rule_names = "ablated_rule_names"; }; class SketchPolicy; diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h index fc1916b8c67d..4d8d6f3e6514 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.h +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h @@ -171,6 +171,11 @@ class PopulationGenerationRule { */ virtual ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const = 0; + /*! + * \brief Get the name of this rule. + * \return A string of the rule name. + */ + virtual std::string GetRuleName() const = 0; /*! \brief The deconstructor */ virtual ~PopulationGenerationRule() = default; @@ -181,6 +186,7 @@ class PopulationGenerationRule { class rule_name : public PopulationGenerationRule { \ public: \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! \brief The rule that fills the incomplete SplitSteps. */ @@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule { public: \ explicit rule_name(double weight) : PopulationMutationRule(weight) {} \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! 
\brief The rule that mutates tile size by randomly dividing a tile size by a factor diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablation_study_on_rule/ablate.sh new file mode 100755 index 000000000000..a4fdeb63ce74 --- /dev/null +++ b/workspace/ablation_study_on_rule/ablate.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Default Tuning with bathc_size=16 at: "$(eval $cur_time) +python -u tune_network_cuda.py -b 16 -d 6 -n 300 --tuned_dir ./result/0613-bs16 > ./log/0613-bs16/default.log 2>&1 +echo "Default Tuning with bathc_size=64 at: "$(eval $cur_time) +python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 > ./log/0614-bs64/default.log 2>&1 + +echo "Begin ablating rules at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0614-bs64/disable-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done diff --git a/workspace/ablation_study_on_rule/apply_tuned.py b/workspace/ablation_study_on_rule/apply_tuned.py new file mode 100644 index 000000000000..ab2c45f4c7c6 --- /dev/null +++ b/workspace/ablation_study_on_rule/apply_tuned.py @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. 
To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +# Get network +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +def apply_tuned(log_file): + print("Apply: %s" % log_file) + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + apply_tuned(log_file) diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablation_study_on_rule/evaluate.sh new file mode 100755 index 000000000000..d199d62cfd5b --- /dev/null +++ b/workspace/ablation_study_on_rule/evaluate.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Begin evaluationg on: "$(eval $cur_time) +echo "End at: $(eval $cur_time)" diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablation_study_on_rule/tune_network_cuda.py new file mode 100644 index 000000000000..21d44b0b8a34 --- /dev/null +++ 
b/workspace/ablation_study_on_rule/tune_network_cuda.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). 
+ +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). 
+# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + 
from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) +log_file = os.path.join(args.tuned_dir, log_name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. 
+# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the section below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4.
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. From da68e96c4d60f3562a317d972887749fa094f511 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Mon, 11 Jul 2022 12:48:21 +0000 Subject: [PATCH 2/4] push local files --- workspace/ablation_study_on_rule/ablate.sh | 23 +- workspace/ablation_study_on_rule/evaluate.sh | 6 + .../ablation_study_on_rule/no_schedule.py | 354 ++++++++++++++++++ .../ablation_study_on_rule/print_best.py | 255 +++++++++++++ .../tune_network_cuda.py | 2 +- workspace/compile.sh | 3 + workspace/default_resnet50.py | 310 +++++++++++++++ workspace/tune_network_cuda.py | 310 +++++++++++++++ 8 files changed, 1254 insertions(+), 9 deletions(-) create mode 100644 workspace/ablation_study_on_rule/no_schedule.py create mode 100644 workspace/ablation_study_on_rule/print_best.py create mode 100644 workspace/compile.sh create mode 100644 workspace/default_resnet50.py create mode 100644 workspace/tune_network_cuda.py diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablation_study_on_rule/ablate.sh index a4fdeb63ce74..118d7bb75b3e 100755 --- a/workspace/ablation_study_on_rule/ablate.sh +++ b/workspace/ablation_study_on_rule/ablate.sh @@ -1,17 +1,24 @@ #!/bin/bash -ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize 
InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize" cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' -echo "Default Tuning with bathc_size=16 at: "$(eval $cur_time) -python -u tune_network_cuda.py -b 16 -d 6 -n 300 --tuned_dir ./result/0613-bs16 > ./log/0613-bs16/default.log 2>&1 -echo "Default Tuning with bathc_size=64 at: "$(eval $cur_time) -python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 > ./log/0614-bs64/default.log 2>&1 +#echo "Default Tuning with batch_size=64 at: "$(eval $cur_time) +#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1 -echo "Begin ablating rules at: "$(eval $cur_time) +echo "Begin ablating rules with bs=16 at: "$(eval $cur_time) for rule in $ablated_rules; do - log_file=./log/0614-bs64/disable-$rule.log + log_file=./log/0620-pair/bs16-disable-RuleCrossThreadReduction-$rule.log echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" - python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 -e $rule > $log_file 2>&1 + python -u tune_network_cuda.py -b 16 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done + +echo "Begin ablating rules with bs=64 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs64-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 echo "End at: $(eval $cur_time)" done diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablation_study_on_rule/evaluate.sh index d199d62cfd5b..a53ac360f930 100755 --- a/workspace/ablation_study_on_rule/evaluate.sh +++ 
b/workspace/ablation_study_on_rule/evaluate.sh @@ -2,4 +2,10 @@ cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' echo "Begin evaluationg on: "$(eval $cur_time) +#python -u no_schedule.py -b 16 -d 7 +#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64 +log_file=./log/bs64-default.debug +python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > $log_file 2>&1 & echo "End at: $(eval $cur_time)" diff --git a/workspace/ablation_study_on_rule/no_schedule.py b/workspace/ablation_study_on_rule/no_schedule.py new file mode 100644 index 000000000000..2007b863e367 --- /dev/null +++ b/workspace/ablation_study_on_rule/no_schedule.py @@ -0,0 +1,354 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) +log_file = os.path.join(args.tuned_dir, log_name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +# +#for idx, task in enumerate(tasks): +# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) +# print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. 
+# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +#run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the section below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/ablation_study_on_rule/print_best.py b/workspace/ablation_study_on_rule/print_best.py new file mode 100644 index 000000000000..f1ac8dd6560c --- /dev/null +++ b/workspace/ablation_study_on_rule/print_best.py @@ -0,0 +1,255 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. 
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = 
relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +dev = tvm.device(str(target), args.device_id) + +def debug_tuned_result(log_file): + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print("Weight:%f" % task_weights[idx]) + compute_dag = task.compute_dag + print("DAG------->") + print(compute_dag) + #sch, args = task.apply_best(log_file) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key) + if inp is None: + print("!!!Can't find tuned schedule, skip") + continue + #sch, tensors = compute_dag.apply_steps_from_state(compute_dag.get_init_state()) + else: + sch, tensors = compute_dag.apply_steps_from_state(inp.state) + lowered_module = tvm.lower(sch, tensors, simple_mode=True) + print("TIR------->") + print(lowered_module) + print("TIR AST------->") + print(lowered_module.astext()) + func = tvm.build(sch, tensors, target) + print("CUDA------->") + #print(task.print_best(log_file, print_mode="cuda")) + print(func.imported_modules[0].get_source()) + input_data = [] + for tensor in tensors: + shape = auto_scheduler.utils.get_const_tuple(tensor.shape) + xd = tvm.nd.array((np.random.uniform(size=shape)).astype(dtype), device=dev) + input_data.append(xd) + evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) + print("Execution time of this task: %.3f ms" % (np.median(evaluator(*input_data).results) * 1000)) + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. 
+ +# Compile with the history best +def apply_tuned(log_file): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +if os.path.isdir(args.tuned_dir): + for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + #apply_tuned(log_file) +else: + log_file = args.tuned_dir + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablation_study_on_rule/tune_network_cuda.py index 21d44b0b8a34..e7074a51739e 100644 --- a/workspace/ablation_study_on_rule/tune_network_cuda.py +++ b/workspace/ablation_study_on_rule/tune_network_cuda.py @@ -230,7 +230,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): def run_tuning(): print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( diff --git a/workspace/compile.sh b/workspace/compile.sh new file mode 100644 index 000000000000..82982219e3cb --- /dev/null +++ b/workspace/compile.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +tvmc compile --target "llvm -mcpu=core-avx2" --output resnet50-v2-7-tvm.tar resnet50-v2-7.onnx diff --git 
a/workspace/default_resnet50.py b/workspace/default_resnet50.py new file mode 100644 index 000000000000..8bdd1453ead8 --- /dev/null +++ b/workspace/default_resnet50.py @@ -0,0 +1,310 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). 
+ +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +# +#for idx, task in enumerate(tasks): +# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) +# print(task.compute_dag) +# +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. 
+# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=48, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +#run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the section below).
+#
+
+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+# Compile with the history best
+#print("Compile...")
+#with auto_scheduler.ApplyHistoryBest(log_file):
+#    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+lib = relay.build(mod, target=target, params=params)
+
+# Create graph executor
+dev = tvm.device(str(target), 7)
+module = graph_executor.GraphModule(lib["default"](dev))
+data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+module.set_input("data", data_tvm)
+
+# Evaluate
+print("Evaluate inference time cost...")
+print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
+
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract feature from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
+#    to distill the large log file and only save the best useful records.
+# 3. You can resume a search from the previous log file. You just need to
+#    add a new argument :code:`load_log_file` when creating the task scheduler
+#    in function :code:`run_tuning`. Say,
+#    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
+# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/tune_network_cuda.py b/workspace/tune_network_cuda.py new file mode 100644 index 000000000000..4a28a2ef9968 --- /dev/null +++ b/workspace/tune_network_cuda.py @@ -0,0 +1,310 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. 
+# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=300, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the section below).
+#
+
+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+# Compile with the history best
+print("Compile...")
+with auto_scheduler.ApplyHistoryBest(log_file):
+    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+        lib = relay.build(mod, target=target, params=params)
+
+# Create graph executor
+dev = tvm.device(str(target), 7)
+module = graph_executor.GraphModule(lib["default"](dev))
+data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+module.set_input("data", data_tvm)
+
+# Evaluate
+print("Evaluate inference time cost...")
+print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
+
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract feature from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
+#    to distill the large log file and only save the best useful records.
+# 3. You can resume a search from the previous log file. You just need to
+#    add a new argument :code:`load_log_file` when creating the task scheduler
+#    in function :code:`run_tuning`. Say,
+#    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
+# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. From 7c3070a64981a79692d867bf6eed9c6ecb760ea9 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Wed, 13 Jul 2022 03:35:14 +0000 Subject: [PATCH 3/4] update --- 3rdparty/cutlass | 2 +- 3rdparty/rang | 2 +- .../ablate.sh | 0 .../apply_tuned.py | 0 .../evaluate.sh | 0 .../print_best.py | 0 .../tune_network_cuda.py | 0 .../ablation_study_on_rule/no_schedule.py | 354 ------------------ 8 files changed, 2 insertions(+), 356 deletions(-) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/ablate.sh (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/apply_tuned.py (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/evaluate.sh (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/print_best.py (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/tune_network_cuda.py (100%) delete mode 100644 workspace/ablation_study_on_rule/no_schedule.py diff --git a/3rdparty/cutlass b/3rdparty/cutlass index c2ee13a0fe99..8a766804ad6f 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c +Subproject commit 8a766804ad6ff14b1164fe922c6fe54c131bb02b diff --git a/3rdparty/rang b/3rdparty/rang index cabe04d6d6b0..22345aa4c468 160000 --- a/3rdparty/rang +++ b/3rdparty/rang @@ -1 +1 @@ -Subproject commit cabe04d6d6b05356fa8f9741704924788f0dd762 +Subproject commit 22345aa4c468db3bd4a0e64a47722aad3518cc81 diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablate_sketch_rule/ablate.sh similarity index 100% rename from workspace/ablation_study_on_rule/ablate.sh rename to workspace/ablate_sketch_rule/ablate.sh diff 
--git a/workspace/ablation_study_on_rule/apply_tuned.py b/workspace/ablate_sketch_rule/apply_tuned.py similarity index 100% rename from workspace/ablation_study_on_rule/apply_tuned.py rename to workspace/ablate_sketch_rule/apply_tuned.py diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablate_sketch_rule/evaluate.sh similarity index 100% rename from workspace/ablation_study_on_rule/evaluate.sh rename to workspace/ablate_sketch_rule/evaluate.sh diff --git a/workspace/ablation_study_on_rule/print_best.py b/workspace/ablate_sketch_rule/print_best.py similarity index 100% rename from workspace/ablation_study_on_rule/print_best.py rename to workspace/ablate_sketch_rule/print_best.py diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablate_sketch_rule/tune_network_cuda.py similarity index 100% rename from workspace/ablation_study_on_rule/tune_network_cuda.py rename to workspace/ablate_sketch_rule/tune_network_cuda.py diff --git a/workspace/ablation_study_on_rule/no_schedule.py b/workspace/ablation_study_on_rule/no_schedule.py deleted file mode 100644 index 2007b863e367..000000000000 --- a/workspace/ablation_study_on_rule/no_schedule.py +++ /dev/null @@ -1,354 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. 
-""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor -import argparse -import os - -################################################################# -# Parse arguments - -def parse_args(): - parser = argparse.ArgumentParser("Tuning arguments") - parser.add_argument( - '-b', - '--batch_size', - type=int, - default=16, - help='batch size') - parser.add_argument( - '-d', - '--device_id', - type=int, - default=7, - help='device id to be used' - ) - parser.add_argument( - '-n', - '--num_measure_trials', - type=int, - default=300, - help='number of trials to be measured' - ) - parser.add_argument( - '--tuned_dir', - default='./result', - help='dirname of tuned result stored' - ) - parser.add_argument( - '-e', - '--ablated_rules', - action='append', - default=[], - help='names of rules to be ablated') - args = parser.parse_args() - return args - -args = parse_args() -print("Arguments: %s" % args) - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = args.batch_size -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) -log_file = os.path.join(args.tuned_dir, log_name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) -# -#for idx, task in enumerate(tasks): -# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) -# print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. 
-# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -#run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), args.device_id) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. From 4638ccfac9eea47f2426eb06aec8b502e484c91e Mon Sep 17 00:00:00 2001 From: CtfGo Date: Wed, 13 Jul 2022 04:59:22 +0000 Subject: [PATCH 4/4] remove redudant --- 3rdparty/dmlc-core | 2 +- workspace/compile.sh | 3 - workspace/default_resnet50.py | 310 --------------------------------- workspace/tune_network_cuda.py | 310 --------------------------------- 4 files changed, 1 insertion(+), 624 deletions(-) delete mode 100644 workspace/compile.sh delete mode 100644 workspace/default_resnet50.py delete mode 100644 workspace/tune_network_cuda.py diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 09511cf9fe5f..21cc7de0dc9f 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 09511cf9fe5ff103900a5eafb50870dc84cc17c8 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/workspace/compile.sh b/workspace/compile.sh deleted file mode 100644 index 82982219e3cb..000000000000 --- a/workspace/compile.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -tvmc compile --target "llvm -mcpu=core-avx2" --output resnet50-v2-7-tvm.tar resnet50-v2-7.onnx diff --git a/workspace/default_resnet50.py b/workspace/default_resnet50.py deleted file mode 100644 index 8bdd1453ead8..000000000000 --- a/workspace/default_resnet50.py +++ /dev/null @@ -1,310 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. 
To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) -# -#for idx, task in enumerate(tasks): -# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) -# print(task.compute_dag) -# -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. 
-# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=48, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -#run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -#print("Compile...") -#with auto_scheduler.ApplyHistoryBest(log_file): -# with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): -lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 7) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/tune_network_cuda.py b/workspace/tune_network_cuda.py deleted file mode 100644 index 4a28a2ef9968..000000000000 --- a/workspace/tune_network_cuda.py +++ /dev/null @@ -1,310 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. 
The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. 
-# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=300, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 7) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`.