From 8c3b97dfe3b1662472616b2168659759eee15cc1 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Tue, 14 Jun 2022 06:14:09 +0000 Subject: [PATCH 1/4] ablation study draft code --- python/tvm/auto_scheduler/search_policy.py | 1 + .../search_policy/sketch_policy.cc | 44 +++ .../search_policy/sketch_policy.h | 2 + .../search_policy/sketch_policy_rules.h | 7 + workspace/ablation_study_on_rule/ablate.sh | 17 + .../ablation_study_on_rule/apply_tuned.py | 200 ++++++++++ workspace/ablation_study_on_rule/evaluate.sh | 5 + .../tune_network_cuda.py | 355 ++++++++++++++++++ 8 files changed, 631 insertions(+) create mode 100755 workspace/ablation_study_on_rule/ablate.sh create mode 100644 workspace/ablation_study_on_rule/apply_tuned.py create mode 100755 workspace/ablation_study_on_rule/evaluate.sh create mode 100644 workspace/ablation_study_on_rule/tune_network_cuda.py diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index a88c1305b560..a7d56672e0ce 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy): "max_innermost_split_factor": 64, "max_vectorize_size": 16, "disable_change_compute_location": 0, + "ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"], } def __init__( diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index 4a4ab18b5eed..afcab3939b07 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->verbose = verbose; node->sample_init_min_pop_ = GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population); + auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names); if (init_search_callbacks) { PrintTitle("Call init-search 
callbacks", verbose); @@ -153,6 +154,49 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, LOG(FATAL) << "No default sketch rules for target: " << task->target; } + // ablating specified rules to measure their impacts + int vp = 0; + for (auto* rule : node->sketch_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating sketch rule: " << rule->GetRuleName() << std::endl; + } else { + node->sketch_rules[vp++] = rule; + StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->sketch_rules.size()) { + node->sketch_rules.erase(node->sketch_rules.begin() + vp, node->sketch_rules.end()); + StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size(); + } + + vp = 0; + for (auto* rule : node->init_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating init rule: " << rule->GetRuleName() << std::endl; + } else { + node->init_rules[vp++] = rule; + StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->init_rules.size()) { + node->init_rules.erase(node->init_rules.begin() + vp, node->init_rules.end()); + StdCout(verbose) << "Init rule size: " << node->init_rules.size(); + } + + vp = 0; + for (auto rule : node->mutation_rules) { + if (ablated_rules.count(rule->GetRuleName())) { + StdCout(verbose) << "Albating mutation rule: " << rule->GetRuleName() << std::endl; + } else { + node->mutation_rules[vp++] = rule; + StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl; + } + } + if (vp < node->mutation_rules.size()) { + node->mutation_rules.erase(node->mutation_rules.begin() + vp, node->mutation_rules.end()); + StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size(); + } + data_ = std::move(node); } diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index 
faf058b45b19..2016f04c350b 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -85,6 +85,8 @@ struct SketchParamKey { static constexpr const char* max_vectorize_size = "max_vectorize_size"; /*! \brief Whether disable compute location changing. */ static constexpr const char* disable_change_compute_location = "disable_change_compute_location"; + /*! \brief The list of rules to be ablated */ + static constexpr const char* ablated_rule_names = "ablated_rule_names"; }; class SketchPolicy; diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h index fc1916b8c67d..4d8d6f3e6514 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.h +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h @@ -171,6 +171,11 @@ class PopulationGenerationRule { */ virtual ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const = 0; + /*! + * \brief Get the name of this rule. + * \return A string of the rule name. + */ + virtual std::string GetRuleName() const = 0; /*! \brief The deconstructor */ virtual ~PopulationGenerationRule() = default; @@ -181,6 +186,7 @@ class PopulationGenerationRule { class rule_name : public PopulationGenerationRule { \ public: \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! \brief The rule that fills the incomplete SplitSteps. */ @@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule { public: \ explicit rule_name(double weight) : PopulationMutationRule(weight) {} \ ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \ + std::string GetRuleName() const final { return #rule_name; } \ }; /*! 
\brief The rule that mutates tile size by randomly dividing a tile size by a factor diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablation_study_on_rule/ablate.sh new file mode 100755 index 000000000000..a4fdeb63ce74 --- /dev/null +++ b/workspace/ablation_study_on_rule/ablate.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Default Tuning with bathc_size=16 at: "$(eval $cur_time) +python -u tune_network_cuda.py -b 16 -d 6 -n 300 --tuned_dir ./result/0613-bs16 > ./log/0613-bs16/default.log 2>&1 +echo "Default Tuning with bathc_size=64 at: "$(eval $cur_time) +python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 > ./log/0614-bs64/default.log 2>&1 + +echo "Begin ablating rules at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0614-bs64/disable-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done diff --git a/workspace/ablation_study_on_rule/apply_tuned.py b/workspace/ablation_study_on_rule/apply_tuned.py new file mode 100644 index 000000000000..ab2c45f4c7c6 --- /dev/null +++ b/workspace/ablation_study_on_rule/apply_tuned.py @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. 
To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +# Get network +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +def apply_tuned(log_file): + print("Apply: %s" % log_file) + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + apply_tuned(log_file) diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablation_study_on_rule/evaluate.sh new file mode 100755 index 000000000000..d199d62cfd5b --- /dev/null +++ b/workspace/ablation_study_on_rule/evaluate.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' +echo "Begin evaluationg on: "$(eval $cur_time) +echo "End at: $(eval $cur_time)" diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablation_study_on_rule/tune_network_cuda.py new file mode 100644 index 000000000000..21d44b0b8a34 --- /dev/null +++ 
b/workspace/ablation_study_on_rule/tune_network_cuda.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). 
+ +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). 
+# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + 
from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) +log_file = os.path.join(args.tuned_dir, log_name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. 
+# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the section below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4.
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. From da68e96c4d60f3562a317d972887749fa094f511 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Mon, 11 Jul 2022 12:48:21 +0000 Subject: [PATCH 2/4] push local files --- workspace/ablation_study_on_rule/ablate.sh | 23 +- workspace/ablation_study_on_rule/evaluate.sh | 6 + .../ablation_study_on_rule/no_schedule.py | 354 ++++++++++++++++++ .../ablation_study_on_rule/print_best.py | 255 +++++++++++++ .../tune_network_cuda.py | 2 +- workspace/compile.sh | 3 + workspace/default_resnet50.py | 310 +++++++++++++++ workspace/tune_network_cuda.py | 310 +++++++++++++++ 8 files changed, 1254 insertions(+), 9 deletions(-) create mode 100644 workspace/ablation_study_on_rule/no_schedule.py create mode 100644 workspace/ablation_study_on_rule/print_best.py create mode 100644 workspace/compile.sh create mode 100644 workspace/default_resnet50.py create mode 100644 workspace/tune_network_cuda.py diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablation_study_on_rule/ablate.sh index a4fdeb63ce74..118d7bb75b3e 100755 --- a/workspace/ablation_study_on_rule/ablate.sh +++ b/workspace/ablation_study_on_rule/ablate.sh @@ -1,17 +1,24 @@ #!/bin/bash -ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize 
InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll" +ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize" cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' -echo "Default Tuning with bathc_size=16 at: "$(eval $cur_time) -python -u tune_network_cuda.py -b 16 -d 6 -n 300 --tuned_dir ./result/0613-bs16 > ./log/0613-bs16/default.log 2>&1 -echo "Default Tuning with bathc_size=64 at: "$(eval $cur_time) -python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 > ./log/0614-bs64/default.log 2>&1 +#echo "Default Tuning with batch_size=64 at: "$(eval $cur_time) +#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1 -echo "Begin ablating rules at: "$(eval $cur_time) +echo "Begin ablating rules with bs=16 at: "$(eval $cur_time) for rule in $ablated_rules; do - log_file=./log/0614-bs64/disable-$rule.log + log_file=./log/0620-pair/bs16-disable-RuleCrossThreadReduction-$rule.log echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" - python -u tune_network_cuda.py -b 64 -d 6 -n 3000 --tuned_dir ./result/0614-bs64 -e $rule > $log_file 2>&1 + python -u tune_network_cuda.py -b 16 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 + echo "End at: $(eval $cur_time)" +done + +echo "Begin ablating rules with bs=64 at: "$(eval $cur_time) +for rule in $ablated_rules; do + log_file=./log/0620-pair/bs64-disable-RuleCrossThreadReduction-$rule.log + echo "Start test at:$(eval $cur_time), rule: $rule, log: $log_file" + python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0620-pair -e RuleCrossThreadReduction -e $rule > $log_file 2>&1 echo "End at: $(eval $cur_time)" done diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablation_study_on_rule/evaluate.sh index d199d62cfd5b..a53ac360f930 100755 --- a/workspace/ablation_study_on_rule/evaluate.sh +++ 
b/workspace/ablation_study_on_rule/evaluate.sh @@ -2,4 +2,10 @@ cur_time='TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"' echo "Begin evaluationg on: "$(eval $cur_time) +#python -u no_schedule.py -b 16 -d 7 +#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64 +log_file=./log/bs64-default.debug +python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > $log_file 2>&1 & +#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > $log_file 2>&1 & echo "End at: $(eval $cur_time)" diff --git a/workspace/ablation_study_on_rule/no_schedule.py b/workspace/ablation_study_on_rule/no_schedule.py new file mode 100644 index 000000000000..2007b863e367 --- /dev/null +++ b/workspace/ablation_study_on_rule/no_schedule.py @@ -0,0 +1,354 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. 
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Tuning arguments") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '-n', + '--num_measure_trials', + type=int, + default=300, + help='number of trials to be measured' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + parser.add_argument( + '-e', + '--ablated_rules', + action='append', + default=[], + help='names of rules to be ablated') + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) +log_file = os.path.join(args.tuned_dir, log_name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +# +#for idx, task in enumerate(tasks): +# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) +# print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. 
+# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +#run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the section below). +# + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. + +# Compile with the history best +print("Compile...") +with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + +# Create graph executor +dev = tvm.device(str(target), args.device_id) +module = graph_executor.GraphModule(lib["default"](dev)) +data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) +module.set_input("data", data_tvm) + +# Evaluate +print("Evaluate inference time cost...") +print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/ablation_study_on_rule/print_best.py b/workspace/ablation_study_on_rule/print_best.py new file mode 100644 index 000000000000..f1ac8dd6560c --- /dev/null +++ b/workspace/ablation_study_on_rule/print_best.py @@ -0,0 +1,255 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor +import argparse +import os + +################################################################# +# Parse arguments + +def parse_args(): + parser = argparse.ArgumentParser("Evaluate tuned result") + parser.add_argument( + '-b', + '--batch_size', + type=int, + default=16, + help='batch size') + parser.add_argument( + '-d', + '--device_id', + type=int, + default=7, + help='device id to be used' + ) + parser.add_argument( + '--tuned_dir', + default='./result', + help='dirname of tuned result stored' + ) + args = parser.parse_args() + return args + +args = parse_args() +print("Arguments: %s" % args) + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. 
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = 
relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = args.batch_size +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" + +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
+ +# Extract tasks from the network +print("Extract tasks...") +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +dev = tvm.device(str(target), args.device_id) + +def debug_tuned_result(log_file): + for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print("Weight:%f" % task_weights[idx]) + compute_dag = task.compute_dag + print("DAG------->") + print(compute_dag) + #sch, args = task.apply_best(log_file) + inp, _ = auto_scheduler.load_best_record(log_file, task.workload_key) + if inp is None: + print("!!!Can't find tuned schedule, skip") + continue + #sch, tensors = compute_dag.apply_steps_from_state(compute_dag.get_init_state()) + else: + sch, tensors = compute_dag.apply_steps_from_state(inp.state) + lowered_module = tvm.lower(sch, tensors, simple_mode=True) + print("TIR------->") + print(lowered_module) + print("TIR AST------->") + print(lowered_module.astext()) + func = tvm.build(sch, tensors, target) + print("CUDA------->") + #print(task.print_best(log_file, print_mode="cuda")) + print(func.imported_modules[0].get_source()) + input_data = [] + for tensor in tensors: + shape = auto_scheduler.utils.get_const_tuple(tensor.shape) + xd = tvm.nd.array((np.random.uniform(size=shape)).astype(dtype), device=dev) + input_data.append(xd) + evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) + print("Execution time of this task: %.3f ms" % (np.median(evaluator(*input_data).results) * 1000)) + + +################################################################# +# Compile and Evaluate +# -------------------- +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. 
+ +# Compile with the history best +def apply_tuned(log_file): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + lib = relay.build(mod, target=target, params=params) + + # Create graph executor + dev = tvm.device(str(target), args.device_id) + module = graph_executor.GraphModule(lib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) + +if os.path.isdir(args.tuned_dir): + for root, dirs, files in os.walk(args.tuned_dir): + for file_name in files: + log_file = os.path.join(root, file_name) + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + #apply_tuned(log_file) +else: + log_file = args.tuned_dir + print("Apply file: %s" % log_file) + debug_tuned_result(log_file) + diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablation_study_on_rule/tune_network_cuda.py index 21d44b0b8a34..e7074a51739e 100644 --- a/workspace/ablation_study_on_rule/tune_network_cuda.py +++ b/workspace/ablation_study_on_rule/tune_network_cuda.py @@ -230,7 +230,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): def run_tuning(): print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tune_option = auto_scheduler.TuningOptions( diff --git a/workspace/compile.sh b/workspace/compile.sh new file mode 100644 index 000000000000..82982219e3cb --- /dev/null +++ b/workspace/compile.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +tvmc compile --target "llvm -mcpu=core-avx2" --output resnet50-v2-7-tvm.tar resnet50-v2-7.onnx diff --git 
a/workspace/default_resnet50.py b/workspace/default_resnet50.py new file mode 100644 index 000000000000..8bdd1453ead8 --- /dev/null +++ b/workspace/default_resnet50.py @@ -0,0 +1,310 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). 
+ +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) +# +#for idx, task in enumerate(tasks): +# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) +# print(task.compute_dag) +# +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. 
+# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=48, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +#run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the section below).
+#
+
+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+# Compile with the history best
+#print("Compile...")
+#with auto_scheduler.ApplyHistoryBest(log_file):
+#    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+lib = relay.build(mod, target=target, params=params)
+
+# Create graph executor
+dev = tvm.device(str(target), 7)
+module = graph_executor.GraphModule(lib["default"](dev))
+data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+module.set_input("data", data_tvm)
+
+# Evaluate
+print("Evaluate inference time cost...")
+print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
+
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract feature from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
+#    to distill the large log file and only save the best useful records.
+# 3. You can resume a search from the previous log file. You just need to
+#    add a new argument :code:`load_log_file` when creating the task scheduler
+#    in function :code:`run_tuning`. Say,
+#    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
+# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/tune_network_cuda.py b/workspace/tune_network_cuda.py new file mode 100644 index 000000000000..4a28a2ef9968 --- /dev/null +++ b/workspace/tune_network_cuda.py @@ -0,0 +1,310 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for NVIDIA GPU +=============================================== +**Author**: `Lianmin Zheng `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for NVIDIA GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. 
The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_executor + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, 
output_shape + + +# Define the neural network and compilation target +network = "resnet-50" +batch_size = 1 +layout = "NHWC" +target = tvm.target.Target("cuda") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Begin Tuning +# ------------ +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. 
+# In practice, we recommend setting it around :code:`900 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 24 tasks in resnet-18, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. +# + + +def run_tuning(): + print("Begin tuning...") + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) + + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=300, # change this to 20000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +run_tuning() + + +###################################################################### +# .. note:: Explain the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.005 | 0.88 | 64 | +# | 1 | 0.010 | 99.10 | 64 | +# | 2 | 0.006 | 0.00 | 64 | +# | 3 | 0.145 | 979.78 | 384 | +# | 4 | 0.130 | 1097.02 | 384 | +# | 5 | 0.143 | 992.69 | 384 | +# | 6 | 0.076 | 1526.86 | 192 | +# | 7 | 0.115 | 999.44 | 320 | +# | 8 | 0.079 | 1449.39 | 320 | +# | 9 | 0.122 | 938.73 | 384 | +# | 10 | 0.063 | 1832.98 | 192 | +# | 11 | 0.072 | 1763.62 | 256 | +# | 12 | 0.062 | 2036.40 | 192 | +# | 13 | 0.068 | 1874.44 | 192 | +# | 14 | 0.049 | 2346.50 | 128 | +# | 15 | 0.076 | 1694.31 | 256 | +# | 16 | 0.067 | 1933.30 | 448 | +# | 17 | 0.076 | 1680.90 | 256 | +# | 18 | 0.022 | 98.43 | 64 | +# | 19 | 0.076 | 3112.55 | 192 | +# | 20 | 0.013 | 2026.44 | 64 | +# | 21 | 0.011 | 1136.69 | 64 | +# | 22 | 0.013 | 992.47 | 64 | +# | 23 | 0.020 | 627.56 | 64 | +# ------------------------------------------------- +# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "tvm::Error"s and CUDA errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. 
note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the section below).
+#
+
+
+#################################################################
+# Compile and Evaluate
+# --------------------
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+# Compile with the history best
+print("Compile...")
+with auto_scheduler.ApplyHistoryBest(log_file):
+    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+        lib = relay.build(mod, target=target, params=params)
+
+# Create graph executor
+dev = tvm.device(str(target), 7)
+module = graph_executor.GraphModule(lib["default"](dev))
+data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+module.set_input("data", data_tvm)
+
+# Evaluate
+print("Evaluate inference time cost...")
+print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
+
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract feature from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
+#    to distill the large log file and only save the best useful records.
+# 3. You can resume a search from the previous log file. You just need to
+#    add a new argument :code:`load_log_file` when creating the task scheduler
+#    in function :code:`run_tuning`. Say,
+#    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
+# 4. 
If you have multiple target GPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. From 7c3070a64981a79692d867bf6eed9c6ecb760ea9 Mon Sep 17 00:00:00 2001 From: CtfGo Date: Wed, 13 Jul 2022 03:35:14 +0000 Subject: [PATCH 3/4] update --- 3rdparty/cutlass | 2 +- 3rdparty/rang | 2 +- .../ablate.sh | 0 .../apply_tuned.py | 0 .../evaluate.sh | 0 .../print_best.py | 0 .../tune_network_cuda.py | 0 .../ablation_study_on_rule/no_schedule.py | 354 ------------------ 8 files changed, 2 insertions(+), 356 deletions(-) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/ablate.sh (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/apply_tuned.py (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/evaluate.sh (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/print_best.py (100%) rename workspace/{ablation_study_on_rule => ablate_sketch_rule}/tune_network_cuda.py (100%) delete mode 100644 workspace/ablation_study_on_rule/no_schedule.py diff --git a/3rdparty/cutlass b/3rdparty/cutlass index c2ee13a0fe99..8a766804ad6f 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit c2ee13a0fe99241b0e798ce647acf98e237f1d0c +Subproject commit 8a766804ad6ff14b1164fe922c6fe54c131bb02b diff --git a/3rdparty/rang b/3rdparty/rang index cabe04d6d6b0..22345aa4c468 160000 --- a/3rdparty/rang +++ b/3rdparty/rang @@ -1 +1 @@ -Subproject commit cabe04d6d6b05356fa8f9741704924788f0dd762 +Subproject commit 22345aa4c468db3bd4a0e64a47722aad3518cc81 diff --git a/workspace/ablation_study_on_rule/ablate.sh b/workspace/ablate_sketch_rule/ablate.sh similarity index 100% rename from workspace/ablation_study_on_rule/ablate.sh rename to workspace/ablate_sketch_rule/ablate.sh diff 
--git a/workspace/ablation_study_on_rule/apply_tuned.py b/workspace/ablate_sketch_rule/apply_tuned.py similarity index 100% rename from workspace/ablation_study_on_rule/apply_tuned.py rename to workspace/ablate_sketch_rule/apply_tuned.py diff --git a/workspace/ablation_study_on_rule/evaluate.sh b/workspace/ablate_sketch_rule/evaluate.sh similarity index 100% rename from workspace/ablation_study_on_rule/evaluate.sh rename to workspace/ablate_sketch_rule/evaluate.sh diff --git a/workspace/ablation_study_on_rule/print_best.py b/workspace/ablate_sketch_rule/print_best.py similarity index 100% rename from workspace/ablation_study_on_rule/print_best.py rename to workspace/ablate_sketch_rule/print_best.py diff --git a/workspace/ablation_study_on_rule/tune_network_cuda.py b/workspace/ablate_sketch_rule/tune_network_cuda.py similarity index 100% rename from workspace/ablation_study_on_rule/tune_network_cuda.py rename to workspace/ablate_sketch_rule/tune_network_cuda.py diff --git a/workspace/ablation_study_on_rule/no_schedule.py b/workspace/ablation_study_on_rule/no_schedule.py deleted file mode 100644 index 2007b863e367..000000000000 --- a/workspace/ablation_study_on_rule/no_schedule.py +++ /dev/null @@ -1,354 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. 
-""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor -import argparse -import os - -################################################################# -# Parse arguments - -def parse_args(): - parser = argparse.ArgumentParser("Tuning arguments") - parser.add_argument( - '-b', - '--batch_size', - type=int, - default=16, - help='batch size') - parser.add_argument( - '-d', - '--device_id', - type=int, - default=7, - help='device id to be used' - ) - parser.add_argument( - '-n', - '--num_measure_trials', - type=int, - default=300, - help='number of trials to be measured' - ) - parser.add_argument( - '--tuned_dir', - default='./result', - help='dirname of tuned result stored' - ) - parser.add_argument( - '-e', - '--ablated_rules', - action='append', - default=[], - help='names of rules to be ablated') - args = parser.parse_args() - return args - -args = parse_args() -print("Arguments: %s" % args) - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = args.batch_size -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_name = "%s-%s-B%d-%s.disable-%s.json" % (network, layout, batch_size, target.kind.name, '_'.join(args.ablated_rules)) -log_file = os.path.join(args.tuned_dir, log_name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) -# -#for idx, task in enumerate(tasks): -# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) -# print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. 
-# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(n_parallel=2, repeat=1, min_repeat_ms=300, timeout=10, device=args.device_id) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=args.num_measure_trials, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option, search_policy_params={'ablated_rule_names' : args.ablated_rules}) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -#run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), args.device_id) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. From 4638ccfac9eea47f2426eb06aec8b502e484c91e Mon Sep 17 00:00:00 2001 From: CtfGo Date: Wed, 13 Jul 2022 04:59:22 +0000 Subject: [PATCH 4/4] remove redudant --- 3rdparty/dmlc-core | 2 +- workspace/compile.sh | 3 - workspace/default_resnet50.py | 310 --------------------------------- workspace/tune_network_cuda.py | 310 --------------------------------- 4 files changed, 1 insertion(+), 624 deletions(-) delete mode 100644 workspace/compile.sh delete mode 100644 workspace/default_resnet50.py delete mode 100644 workspace/tune_network_cuda.py diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 09511cf9fe5f..21cc7de0dc9f 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 09511cf9fe5ff103900a5eafb50870dc84cc17c8 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/workspace/compile.sh b/workspace/compile.sh deleted file mode 100644 index 82982219e3cb..000000000000 --- a/workspace/compile.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -tvmc compile --target "llvm -mcpu=core-avx2" --output resnet50-v2-7-tvm.tar resnet50-v2-7.onnx diff --git a/workspace/default_resnet50.py b/workspace/default_resnet50.py deleted file mode 100644 index 8bdd1453ead8..000000000000 --- a/workspace/default_resnet50.py +++ /dev/null @@ -1,310 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. 
To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -#tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) -# -#for idx, task in enumerate(tasks): -# print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) -# print(task.compute_dag) -# -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. 
-# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=48, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -#run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -#print("Compile...") -#with auto_scheduler.ApplyHistoryBest(log_file): -# with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): -lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 7) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. diff --git a/workspace/tune_network_cuda.py b/workspace/tune_network_cuda.py deleted file mode 100644 index 4a28a2ef9968..000000000000 --- a/workspace/tune_network_cuda.py +++ /dev/null @@ -1,310 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. 
The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. -# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mxnet": - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - - assert layout == "NCHW" - - block = get_model("resnet18_v1", pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs - ) - mod = tvm.IRModule.from_expr(net) - - return mod, params, input_shape, 
output_shape - - -# Define the neural network and compilation target -network = "resnet-50" -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. 
-# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=300, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. 
note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 7) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. 
If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`.