diff --git a/3rdparty/cutlass b/3rdparty/cutlass index c2ee13a0fe99..8a766804ad6f 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c +Subproject commit 8a766804ad6ff14b1164fe922c6fe54c131bb02b diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 09511cf9fe5f..21cc7de0dc9f 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 09511cf9fe5ff103900a5eafb50870dc84cc17c8 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/3rdparty/rang b/3rdparty/rang index cabe04d6d6b0..22345aa4c468 160000 --- a/3rdparty/rang +++ b/3rdparty/rang @@ -1 +1 @@ -Subproject commit cabe04d6d6b05356fa8f9741704924788f0dd762 +Subproject commit 22345aa4c468db3bd4a0e64a47722aad3518cc81 diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index a88c1305b560..a7d56672e0ce 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy): "max_innermost_split_factor": 64, "max_vectorize_size": 16, "disable_change_compute_location": 0, + "ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"], } def __init__( diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 4a4ab18b5eed..afcab3939b07 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->verbose = verbose; node->sample_init_min_pop_ = GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population); + auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names); if (init_search_callbacks) { PrintTitle("Call init-search callbacks", verbose); @@ -153,6 +154,49 @@ 
SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, LOG(FATAL) << "No default sketch rules for target: " << task->target; } + // ablating specified rules to measure their impacts + int vp = 0; + for (auto* rule : node->sketch_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating sketch rule: " << rule->GetRuleName() << std::endl; + } else { + node->sketch_rules[vp++] = rule; + StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->sketch_rules.size()) { + node->sketch_rules.erase(node->sketch_rules.begin() + vp, node->sketch_rules.end()); + StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size(); + } + + vp = 0; + for (auto* rule : node->init_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating init rule: " << rule->GetRuleName() << std::endl; + } else { + node->init_rules[vp++] = rule; + StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->init_rules.size()) { + node->init_rules.erase(node->init_rules.begin() + vp, node->init_rules.end()); + StdCout(verbose) << "Init rule size: " << node->init_rules.size(); + } + + vp = 0; + for (auto rule : node->mutation_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Ablating mutation rule: " << rule->GetRuleName() << std::endl; + } else { + node->mutation_rules[vp++] = rule; + StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->mutation_rules.size()) { + node->mutation_rules.erase(node->mutation_rules.begin() + vp, node->mutation_rules.end()); + StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size(); + } + + data_ = std::move(node); } diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index faf058b45b19..2016f04c350b 100644 --- 
a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -85,6 +85,8 @@ struct SketchParamKey { static constexpr const char* max_vectorize_size = "max_vectorize_size"; /*! \brief Whether disable compute location changing. */ static constexpr const char* disable_change_compute_location = "disable_change_compute_location"; + /*! \brief The list of rules to be ablated */ + static constexpr const char* ablated_rule_names = "ablated_rule_names"; }; class SketchPolicy; diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h index fc1916b8c67d..4d8d6f3e6514 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.h +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h @@ -171,6 +171,11 @@ class PopulationGenerationRule { */ virtual ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const = 0; + /*! + * \brief Get the name of this rule. + * \return A string of the rule name. + */ + virtual std::string GetRuleName() const = 0; /*! \brief The deconstructor */ virtual ~PopulationGenerationRule() = default; @@ -181,6 +186,7 @@ class PopulationGenerationRule { class rule_name : public PopulationGenerationRule { \ public: \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! \brief The rule that fills the incomplete SplitSteps. */ @@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule { public: \ explicit rule_name(double weight) : PopulationMutationRule(weight) {} \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! 
\brief The rule that mutates tile size by randomly dividing a tile size by a factor diff --git a/workspace/ablate_sketch_rule/ablate.sh b/workspace/ablate_sketch_rule/ablate.sh new file mode 100755 index 000000000000..118d7bb75b3e --- /dev/null +++ b/workspace/ablate_sketch_rule/ablate.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize" + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +#echo "Default Tuning with batch_size=64 at: "$(eval $cur_time) +#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1 + +echo "Begin ablating rules with bs=16 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs16-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 16 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done + +echo "Begin ablating rules with bs=64 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs64-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done diff --git a/workspace/ablate_sketch_rule/apply_tuned.py b/workspace/ablate_sketch_rule/apply_tuned.py new file mode 100644 index 000000000000..ab2c45f4c7c6 --- /dev/null +++ b/workspace/ablate_sketch_rule/apply_tuned.py @@ -0,0 +1,200 @@ 
+# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. 
In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +# Get network +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +def apply_tuned(log_file): + print("Apply: %s" % log_file) + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + apply_tuned(log_file) diff --git a/workspace/ablate_sketch_rule/evaluate.sh b/workspace/ablate_sketch_rule/evaluate.sh new file mode 100755 index 000000000000..a53ac360f930 --- /dev/null +++ b/workspace/ablate_sketch_rule/evaluate.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Begin evaluating on: "$(eval $cur_time) +#python -u no_schedule.py -b 16 -d 7 +#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64 +log_file=./log/bs64-default.debug +python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json 
> $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > $log_file 2>&1 & +echo "End at: $(eval $cur_time)" diff --git a/workspace/ablate_sketch_rule/print_best.py b/workspace/ablate_sketch_rule/print_best.py new file mode 100644 index 000000000000..f1ac8dd6560c --- /dev/null +++ b/workspace/ablate_sketch_rule/print_best.py @@ -0,0 +1,255 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. 
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = 
relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +dev = tvm.device(str(target), args.device_id) + +def debug_tuned_result(log_file): + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print("Weight:%f" % task_weights[idx]) + compute_dag = task.compute_dag + print("DAG------->") + print(compute_dag) + #sch, args = task.apply_best(log_file) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key) + if inp is None: + print("!!!Can't find tuned schedule, skip") + continue + #sch, tensors = compute_dag.apply_steps_from_state(compute_dag.get_init_state()) + else: + sch, tensors = compute_dag.apply_steps_from_state(inp.state) + lowered_module = tvm.lower(sch, tensors, simple_mode=True) + print("TIR------->") + print(lowered_module) + print("TIR AST------->") + print(lowered_module.astext()) + func = tvm.build(sch, tensors, target) + print("CUDA------->") + #print(task.print_best(log_file, print_mode="cuda")) + print(func.imported_modules[0].get_source()) + input_data = [] + for tensor in tensors: + shape = auto_scheduler.utils.get_const_tuple(tensor.shape) + xd = tvm.nd.array((np.random.uniform(size=shape)).astype(dtype), device=dev) + input_data.append(xd) + evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) + print("Execution time of this task: %.3f ms" % (np.median(evaluator(*input_data).results) * 1000)) + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. 
+ +# Compile with the history best +def apply_tuned(log_file): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +if os.path.isdir(args.tuned_dir): + for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + #apply_tuned(log_file) +else: + log_file = args.tuned_dir + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + diff --git a/workspace/ablate_sketch_rule/tune_network_cuda.py b/workspace/ablate_sketch_rule/tune_network_cuda.py new file mode 100644 index 000000000000..e7074a51739e --- /dev/null +++ b/workspace/ablate_sketch_rule/tune_network_cuda.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 


def get_network(name, batch_size, layout="NHWC", dtype="float32"):
    """Get the symbol definition and random weight of a network.

    Parameters
    ----------
    name : str
        Network identifier, e.g. "resnet-50", "resnet3d-18", "mobilenet",
        "squeezenet_v1.1", "inception_v3", or "mxnet".
    batch_size : int
        Batch dimension of the input tensor.
    layout : str
        Either "NHWC" (preferred by the auto-scheduler) or "NCHW".
    dtype : str
        Data type of the network, e.g. "float32".

    Returns
    -------
    tuple
        (mod, params, input_shape, output_shape).

    Raises
    ------
    ValueError
        If the layout or the network name is not recognized.
    """
    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        # BUG FIX: this branch previously duplicated the 2-D resnet branch
        # verbatim; a "resnet3d-N" name should build the 3-D resnet workload.
        # NOTE(review): a 3-D resnet presumably expects an image_shape with a
        # temporal/depth dimension — confirm against relay.testing.resnet_3d.
        mod, params = relay.testing.resnet_3d.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"

        block = get_model("resnet18_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    else:
        # BUG FIX: unknown names previously fell through and raised a
        # confusing NameError on the unbound `mod`; fail fast instead.
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape


# Define the neural network and compilation target
network = "resnet-50"
batch_size = args.batch_size
layout = "NHWC"
target = tvm.target.Target("cuda")
dtype = "float32"
log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules))
log_file = os.path.join(args.tuned_dir, log_name)
# Ensure the output directory exists; otherwise RecordToFile fails later
# with a hard-to-trace error when it first tries to write the log.
os.makedirs(args.tuned_dir, exist_ok=True)

#################################################################
# Extract Search Tasks
# --------------------
# Next, we extract the search tasks and their weights from a network.
# The weight of a task is the number of appearances of the task's subgraph
# in the whole network.
# By using the weight, we can approximate the end-to-end latency of the network
# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the
# latency of a task and :code:`weight[t]` is the weight of the task.
# The task scheduler will just optimize this objective.

# Extract tasks from the network
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)

#################################################################
# Begin Tuning
# ------------
# Now, we set some options for tuning and launch the search tasks
#
# * :code:`measure_ctx` launches a different process for measurement to
#   provide isolation. It can protect the main process from GPU crashes
#   during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
#   This can warmup the GPU, which is necessary to get accurate measurement results.
#   Typically, we recommend a value >= 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
#   You can set it to a small number (e.g., 200) for a fast demonstrative run.
#   In practice, we recommend setting it around :code:`900 * len(tasks)`,
#   which is typically enough for the search to converge.
#   For example, there are 24 tasks in resnet-18, so we can set it as 20000.
#   You can adjust this parameter according to your time budget.
# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file,
#   The measurement records can be used to query the history best, resume the search,
#   and do more analyses later.
# * see :any:`auto_scheduler.TuningOptions`,
#   :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters.
#


def run_tuning():
    """Run the auto-scheduler search over all extracted tasks.

    Measurement records are appended to ``log_file``; the ablated rule
    names from the command line are forwarded to the search policy.
    """
    print("Begin tuning...")
    # A separate RPC measurement process isolates GPU crashes from us.
    ctx = auto_scheduler.LocalRPCMeasureContext(
        n_parallel=2,
        repeat=1,
        min_repeat_ms=300,
        timeout=10,
        device=args.device_id,
    )

    options = auto_scheduler.TuningOptions(
        num_measure_trials=args.num_measure_trials,  # change this to 20000 to achieve the best performance
        runner=ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
    scheduler.tune(options, search_policy_params={'ablated_rule_names': args.ablated_rules})


# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

run_tuning()


######################################################################
# .. note:: Explain the printed information during tuning
#
#   During the tuning, a lot of information will be printed on the console.
#   They are used for debugging purposes. The most important info is the output
#   of the task scheduler. The following table is a sample output.
#
# ..
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the secion below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`.