CtfGo · CtfGo · Jun 14, 2022 · Jul 11, 2022 · Jul 13, 2022 · Jul 13, 2022
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
diff --git a/3rdparty/rang b/3rdparty/rang
diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
@@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy):
         "max_innermost_split_factor": 64,
         "max_vectorize_size": 16,
         "disable_change_compute_location": 0,
+        "ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"],
     }
 
     def __init__(

diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model,
   node->verbose = verbose;
   node->sample_init_min_pop_ =
       GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population);
+  auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names);
 
   if (init_search_callbacks) {
     PrintTitle("Call init-search callbacks", verbose);
@@ -153,6 +154,49 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model,
     LOG(FATAL) << "No default sketch rules for target: " << task->target;
   }
 
+  // Ablation support: drop every rule whose name is in `ablated_rules` so that the
+  // impact of individual rules can be measured.
+  //
+  // Each rule list is compacted in place: kept rules are moved to the front in their
+  // original order and the leftover tail is erased. The write index never passes the
+  // element currently being read, so overwriting entries during the walk is safe.
+  size_t num_sketch_kept = 0;
+  for (const auto& rule : node->sketch_rules) {
+    if (ablated_rules.count(rule->GetRuleName())) {
+      StdCout(verbose) << "Ablating sketch rule: " << rule->GetRuleName() << std::endl;
+    } else {
+      node->sketch_rules[num_sketch_kept++] = rule;
+      StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl;
+    }
+  }
+  // The remaining size is only reported when at least one rule was removed.
+  if (num_sketch_kept < node->sketch_rules.size()) {
+    node->sketch_rules.erase(node->sketch_rules.begin() + num_sketch_kept,
+                             node->sketch_rules.end());
+    StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size() << std::endl;
+  }
+
+  size_t num_init_kept = 0;
+  for (const auto& rule : node->init_rules) {
+    if (ablated_rules.count(rule->GetRuleName())) {
+      StdCout(verbose) << "Ablating init rule: " << rule->GetRuleName() << std::endl;
+    } else {
+      node->init_rules[num_init_kept++] = rule;
+      StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl;
+    }
+  }
+  if (num_init_kept < node->init_rules.size()) {
+    node->init_rules.erase(node->init_rules.begin() + num_init_kept, node->init_rules.end());
+    StdCout(verbose) << "Init rule size: " << node->init_rules.size() << std::endl;
+  }
+
+  size_t num_mutation_kept = 0;
+  for (const auto& rule : node->mutation_rules) {
+    if (ablated_rules.count(rule->GetRuleName())) {
+      StdCout(verbose) << "Ablating mutation rule: " << rule->GetRuleName() << std::endl;
+    } else {
+      node->mutation_rules[num_mutation_kept++] = rule;
+      StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl;
+    }
+  }
+  if (num_mutation_kept < node->mutation_rules.size()) {
+    node->mutation_rules.erase(node->mutation_rules.begin() + num_mutation_kept,
+                               node->mutation_rules.end());
+    StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size() << std::endl;
+  }
+
   data_ = std::move(node);
 }
 

diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -85,6 +85,8 @@ struct SketchParamKey {
   static constexpr const char* max_vectorize_size = "max_vectorize_size";
   /*! \brief Whether disable compute location changing. */
   static constexpr const char* disable_change_compute_location = "disable_change_compute_location";
+  /*! \brief The list of rules to be ablated */
+  static constexpr const char* ablated_rule_names = "ablated_rule_names";
 };
 
 class SketchPolicy;

diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -171,6 +171,11 @@ class PopulationGenerationRule {
    */
   virtual ResultKind Apply(SketchPolicyNode* policy, State* state,
                            std::mt19937* rand_gen) const = 0;
+  /*!
+   * \brief Get the name of this rule.
+   * \return A string of the rule name.
+   */
+  virtual std::string GetRuleName() const = 0;
 
   /*! \brief The deconstructor */
   virtual ~PopulationGenerationRule() = default;
@@ -181,6 +186,7 @@ class PopulationGenerationRule {
   class rule_name : public PopulationGenerationRule {                                             \
    public:                                                                                        \
     ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
+    std::string GetRuleName() const final { return #rule_name; }                                  \
   };
 
 /*! \brief The rule that fills the incomplete SplitSteps. */
@@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule {
    public:                                                                                        \
     explicit rule_name(double weight) : PopulationMutationRule(weight) {}                         \
     ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
+    std::string GetRuleName() const final { return #rule_name; }                                  \
   };
 
 /*! \brief The rule that mutates tile size by randomly dividing a tile size by a factor

diff --git a/workspace/ablate_sketch_rule/ablate.sh b/workspace/ablate_sketch_rule/ablate.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Runs tune_network_cuda.py once per (batch size, rule) pair. Every run disables
+# $base_rule plus one rule from $ablated_rules and writes its output to its own log.
+
+#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll"
+ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize"
+
+base_rule=RuleCrossThreadReduction
+log_dir=./log/0620-pair
+tuned_dir=./result/0620-pair
+
+# Print the current time. POSIX TZ strings invert the sign, so "UTC-8" is the zone
+# eight hours ahead of UTC.
+now() {
+    TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"
+}
+
+# The output redirection below fails, and python never starts, if this directory is missing.
+mkdir -p "$log_dir"
+
+#echo "Default Tuning with batch_size=64 at: $(now)"
+#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1
+
+for bs in 16 64; do
+    echo "Begin ablating rules with bs=${bs} at: $(now)"
+    # $ablated_rules is intentionally unquoted: it is a space-separated list of rule names.
+    for rule in $ablated_rules; do
+        log_file=${log_dir}/bs${bs}-disable-${base_rule}-${rule}.log
+        echo "Start test at:$(now), rule: $rule, log: $log_file"
+        python -u tune_network_cuda.py -b "$bs" -d 7 -n 300 --tuned_dir "$tuned_dir" -e "$base_rule" -e "$rule" > "$log_file" 2>&1
+        echo "End at: $(now)"
+    done
+done
diff --git a/workspace/ablate_sketch_rule/apply_tuned.py b/workspace/ablate_sketch_rule/apply_tuned.py
@@ -0,0 +1,200 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling a Neural Network for NVIDIA GPU
+===============================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole neural
+network for NVIDIA GPU with the auto-scheduler.
+
+To auto-tune a neural network, we partition the network into small subgraphs and
+tune them independently. Each subgraph is treated as one search task.
+A task scheduler slices the time and dynamically allocates time resources to
+these tasks. The task scheduler predicts the impact of each task on the end-to-end
+execution time and prioritizes the one that can reduce the execution time the most.
+
+For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to
+get the computational DAG in the tensor expression form.
+We then use the auto-scheduler to construct a search space of this DAG and search
+for good schedules (low-level optimizations).
+
+Different from the template-based :ref:`autotvm <tutorials-autotvm-sec>` which relies on
+manual templates to define the search space, the auto-scheduler does not require any
+schedule templates. In other words, the auto-scheduler only uses the compute declarations
+in :code:`tvm/python/topi` and does not use existing schedule templates.
+
+Note that this tutorial will not run on Windows or recent versions of macOS. To
+get it to run, you will need to wrap the body of this tutorial in a :code:`if
+__name__ == "__main__":` block.
+"""
+
+import numpy as np
+
+import tvm
+from tvm import relay, auto_scheduler
+import tvm.relay.testing
+from tvm.contrib import graph_executor
+import argparse
+import os
+
+#################################################################
+# Parse arguments
+
+def parse_args():
+    """Parse the command-line options of this script.
+
+    Returns
+    -------
+    argparse.Namespace
+        Carries ``batch_size`` (int, default 16), ``device_id`` (int, default 7)
+        and ``tuned_dir`` (str, default ``./result``).
+    """
+    cli = argparse.ArgumentParser("Evaluate tuned result")
+    cli.add_argument("-b", "--batch_size", type=int, default=16, help="batch size")
+    cli.add_argument("-d", "--device_id", type=int, default=7, help="device id to be used")
+    cli.add_argument("--tuned_dir", default="./result", help="dirname of tuned result stored")
+    return cli.parse_args()
+
+
+# Parsed once at import time; later code reads the options through this module-level name.
+args = parse_args()
+print("Arguments: %s" % args)
+
+#################################################################
+# Define a Network
+# ----------------
+# First, we need to define the network with relay frontend API.
+# We can load some pre-defined network from :code:`tvm.relay.testing`.
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow
+# (see :ref:`front end tutorials<tutorial-frontend>`).
+#
+# For convolutional neural networks, although auto-scheduler can work correctly
+# with any layout, we found the best performance is typically achieved with NHWC layout.
+# We also implemented more optimizations for NHWC layout with the auto-scheduler.
+# So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
+# You can use :ref:`ConvertLayout <convert-layout-usage>` pass to do the layout conversion in TVM.
+
+
+def get_network(name, batch_size, layout="NHWC", dtype="float32"):
+    """Get the symbol definition and random weight of a network.
+
+    Parameters
+    ----------
+    name : str
+        Network to build: "resnet-<N>", "resnet3d-<N>", "mobilenet",
+        "squeezenet_v1.1", "inception_v3" or "mxnet".
+    batch_size : int
+        Leading dimension of the returned input and output shapes.
+    layout : str
+        "NHWC" (default, preferred by the auto-scheduler) or "NCHW".
+    dtype : str
+        Data type forwarded to the workload builder.
+
+    Returns
+    -------
+    mod, params, input_shape, output_shape
+        The module and parameters produced by the selected builder, the input
+        shape tuple, and the ``(batch_size, 1000)`` output shape tuple.
+
+    Raises
+    ------
+    ValueError
+        If ``layout`` or ``name`` is not one of the supported values.
+    AssertionError
+        If "squeezenet_v1.1" or "mxnet" is requested with a layout other than "NCHW".
+    """
+
+    # auto-scheduler prefers NHWC layout
+    if layout == "NHWC":
+        image_shape = (224, 224, 3)
+    elif layout == "NCHW":
+        image_shape = (3, 224, 224)
+    else:
+        raise ValueError("Invalid layout: " + layout)
+
+    input_shape = (batch_size,) + image_shape
+    output_shape = (batch_size, 1000)
+
+    if name.startswith(("resnet-", "resnet3d-")):
+        # NOTE(review): "resnet3d-" names are built with the 2-D
+        # relay.testing.resnet workload, exactly as before this cleanup.
+        # Confirm whether a 3-D workload was intended.
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer,
+            batch_size=batch_size,
+            layout=layout,
+            dtype=dtype,
+            image_shape=image_shape,
+        )
+    elif name == "mobilenet":
+        mod, params = relay.testing.mobilenet.get_workload(
+            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
+        )
+    elif name == "squeezenet_v1.1":
+        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
+        mod, params = relay.testing.squeezenet.get_workload(
+            version="1.1",
+            batch_size=batch_size,
+            dtype=dtype,
+            image_shape=image_shape,
+        )
+    elif name == "inception_v3":
+        # Inception v3 takes 299x299 images, so the reported input shape is overridden.
+        # NOTE(review): ``layout`` is not forwarded to the builder here — verify that
+        # the built module really matches the NHWC input shape reported below.
+        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
+        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == "mxnet":
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+
+        assert layout == "NCHW"
+
+        block = get_model("resnet18_v1", pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
+        # Append a softmax to the imported network's output.
+        net = mod["main"]
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
+        mod = tvm.IRModule.from_expr(net)
+    else:
+        # Without this branch an unknown name reached the return statement with
+        # ``mod`` and ``params`` unbound and failed with UnboundLocalError.
+        raise ValueError("Unsupported network: " + name)
+
+    return mod, params, input_shape, output_shape
+
+
+# Workload under evaluation: ResNet-50, NHWC layout, float32, compiled for CUDA.
+# The batch size is the only setting taken from the command line.
+network, layout, dtype = "resnet-50", "NHWC", "float32"
+batch_size = args.batch_size
+target = tvm.target.Target("cuda")
+
+# Build the Relay module; the input shape is used later to create the random input.
+mod, params, input_shape, output_shape = get_network(
+    network, batch_size, layout=layout, dtype=dtype
+)
+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+# Compile with the history best
+def apply_tuned(log_file):
+    """Build the network with the records in ``log_file`` applied, then benchmark it.
+
+    Reads the module-level ``mod``, ``params``, ``target``, ``input_shape``,
+    ``dtype`` and ``args`` and prints the benchmark result; nothing is returned.
+
+    Parameters
+    ----------
+    log_file : str
+        Path of the tuning record file handed to ``auto_scheduler.ApplyHistoryBest``.
+    """
+    print("Apply: %s" % log_file)
+    # Both contexts are entered left to right, matching the original nesting order.
+    with auto_scheduler.ApplyHistoryBest(log_file), tvm.transform.PassContext(
+        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
+    ):
+        lib = relay.build(mod, target=target, params=params)
+
+    # Instantiate the compiled graph on the device selected on the command line.
+    device = tvm.device(str(target), args.device_id)
+    executor = graph_executor.GraphModule(lib["default"](device))
+
+    # Feed one uniformly random tensor as the "data" input.
+    random_input = np.random.uniform(size=input_shape).astype(dtype)
+    executor.set_input("data", tvm.nd.array(random_input))
+
+    # Time the compiled module and print the result.
+    print(executor.benchmark(device, repeat=3, min_repeat_ms=500))
+
+# Evaluate every file found anywhere below the tuned-result directory.
+for dirpath, _subdirs, filenames in os.walk(args.tuned_dir):
+    for entry in filenames:
+        apply_tuned(os.path.join(dirpath, entry))
diff --git a/workspace/ablate_sketch_rule/evaluate.sh b/workspace/ablate_sketch_rule/evaluate.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+#
+# Runs print_best.py on one tuned-result file, sends its output to $log_file and
+# prints a timestamp before and after the run.
+
+# Print the current time. POSIX TZ strings invert the sign, so "UTC-8" is the zone
+# eight hours ahead of UTC.
+now() {
+    TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"
+}
+
+echo "Begin evaluating on: $(now)"
+#python -u no_schedule.py -b 16 -d 7
+#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64
+log_file=./log/bs64-default.debug
+# The redirection fails, and python never starts, if the log directory is missing.
+mkdir -p "$(dirname "$log_file")"
+python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json > "$log_file" 2>&1 &
+#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > $log_file 2>&1 &
+#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > $log_file 2>&1 &
+# Jobs are started in the background, so wait for them; otherwise the "End at"
+# stamp is printed immediately instead of when the evaluation has finished.
+wait
+echo "End at: $(now)"
+3 −14		.github/workflows/githubci.yml
+1 −17		CMakeLists.txt
+0 −2		cmake/build_config.h.in
+8 −1		doc/Doxyfile
+8 −10		include/dmlc/base.h
+3 −9		include/dmlc/build_config_default.h
+1 −1		include/dmlc/json.h
+15 −160		include/dmlc/optional.h
+2 −2		include/dmlc/parameter.h
+0 −11		scripts/conda_env.yml
+6 −7		scripts/lint.py
+11 −15		scripts/test_script.sh
+0 −25		src/data.cc
+0 −183		src/data/parquet_parser.h
+1 −1		src/data/text_parser.h
+1 −2		test/.gitignore
+2 −0		test/dmlc_test.mk
+13 −0		test/unittest/dmlc_unittest.mk
+0 −52		test/unittest/unittest_optional.cc
+0 −120		test/unittest/unittest_parquet_parser.cc
+2 −3		test/unittest/unittest_threaditer.cc
+0 −6		test/unittest/unittest_threaditer.h
+9 −2		test/unittest/unittest_threaditer_exc_handling.cc
+1 −1		tracker/dmlc_tracker/sge.py
+0 −154		.travis.yml
+85 −4		CMakeLists.txt
+60 −0		cmake/CMakeUtilities.cmake
+4 −0		cmake/rang-config.cmake.in
+8 −0		cmake/rang.pc.in
+1 −1		conanfile.py
+12 −5		meson.build
+32 −0		test/CMakeLists.txt
+1 −1		test_package/meson.build