Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 1097 files
1 change: 1 addition & 0 deletions python/tvm/auto_scheduler/search_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class SketchPolicy(SearchPolicy):
"max_innermost_split_factor": 64,
"max_vectorize_size": 16,
"disable_change_compute_location": 0,
"ablated_rule_names" : ["RuleCrossThreadReduction", "MutateAutoUnroll"],
}

def __init__(
Expand Down
44 changes: 44 additions & 0 deletions src/auto_scheduler/search_policy/sketch_policy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model,
node->verbose = verbose;
node->sample_init_min_pop_ =
GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population);
auto ablated_rules = GetIterNameSetParam(node->params, SketchParamKey::ablated_rule_names);

if (init_search_callbacks) {
PrintTitle("Call init-search callbacks", verbose);
Expand Down Expand Up @@ -153,6 +154,49 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model,
LOG(FATAL) << "No default sketch rules for target: " << task->target;
}

// Ablate the rules whose names appear in `ablated_rules` so that their impact on the
// search can be measured. Each rule list is compacted in place: the rules that stay are
// shifted to the front in their original order and the leftover tail is erased. The write
// index never runs ahead of the element being visited, so no unvisited entry is clobbered.
size_t num_kept_sketch = 0;
for (auto* rule : node->sketch_rules) {
  if (ablated_rules.count(rule->GetRuleName())) {
    StdCout(verbose) << "Ablating sketch rule: " << rule->GetRuleName() << std::endl;
  } else {
    node->sketch_rules[num_kept_sketch++] = rule;
    StdCout(verbose) << "Enable sketch rule: " << rule->GetRuleName() << std::endl;
  }
}
// The resulting size is only reported when at least one rule was actually removed.
if (num_kept_sketch < node->sketch_rules.size()) {
  node->sketch_rules.erase(node->sketch_rules.begin() + num_kept_sketch,
                           node->sketch_rules.end());
  StdCout(verbose) << "Sketch rule size: " << node->sketch_rules.size() << std::endl;
}

size_t num_kept_init = 0;
for (auto* rule : node->init_rules) {
  if (ablated_rules.count(rule->GetRuleName())) {
    StdCout(verbose) << "Ablating init rule: " << rule->GetRuleName() << std::endl;
  } else {
    node->init_rules[num_kept_init++] = rule;
    StdCout(verbose) << "Enable init rule: " << rule->GetRuleName() << std::endl;
  }
}
if (num_kept_init < node->init_rules.size()) {
  node->init_rules.erase(node->init_rules.begin() + num_kept_init, node->init_rules.end());
  StdCout(verbose) << "Init rule size: " << node->init_rules.size() << std::endl;
}

size_t num_kept_mutation = 0;
// Bind by const reference so the element is not copied on every iteration; the assignment
// below is at worst a self-assignment, which leaves the element unchanged.
// NOTE(review): assumes the element type is a pointer or smart pointer for which
// self-assignment is a no-op -- confirm against the declaration of mutation_rules.
for (const auto& rule : node->mutation_rules) {
  if (ablated_rules.count(rule->GetRuleName())) {
    StdCout(verbose) << "Ablating mutation rule: " << rule->GetRuleName() << std::endl;
  } else {
    node->mutation_rules[num_kept_mutation++] = rule;
    StdCout(verbose) << "Enable mutation rule: " << rule->GetRuleName() << std::endl;
  }
}
if (num_kept_mutation < node->mutation_rules.size()) {
  node->mutation_rules.erase(node->mutation_rules.begin() + num_kept_mutation,
                             node->mutation_rules.end());
  StdCout(verbose) << "Mutation rule size: " << node->mutation_rules.size() << std::endl;
}

data_ = std::move(node);
}

Expand Down
2 changes: 2 additions & 0 deletions src/auto_scheduler/search_policy/sketch_policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ struct SketchParamKey {
static constexpr const char* max_vectorize_size = "max_vectorize_size";
/*! \brief Whether disable compute location changing. */
static constexpr const char* disable_change_compute_location = "disable_change_compute_location";
/*! \brief The list of rules to be ablated */
static constexpr const char* ablated_rule_names = "ablated_rule_names";
};

class SketchPolicy;
Expand Down
7 changes: 7 additions & 0 deletions src/auto_scheduler/search_policy/sketch_policy_rules.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ class PopulationGenerationRule {
*/
virtual ResultKind Apply(SketchPolicyNode* policy, State* state,
std::mt19937* rand_gen) const = 0;
/*!
* \brief Get the name of this rule.
* \return A string of the rule name.
*/
virtual std::string GetRuleName() const = 0;

/*! \brief The destructor */
virtual ~PopulationGenerationRule() = default;
Expand All @@ -181,6 +186,7 @@ class PopulationGenerationRule {
class rule_name : public PopulationGenerationRule { \
public: \
ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
std::string GetRuleName() const final { return #rule_name; } \
};

/*! \brief The rule that fills the incomplete SplitSteps. */
Expand Down Expand Up @@ -223,6 +229,7 @@ class PopulationMutationRule : public PopulationGenerationRule {
public: \
explicit rule_name(double weight) : PopulationMutationRule(weight) {} \
ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
std::string GetRuleName() const final { return #rule_name; } \
};

/*! \brief The rule that mutates tile size by randomly dividing a tile size by a factor
Expand Down
24 changes: 24 additions & 0 deletions workspace/ablate_sketch_rule/ablate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Rule-ablation tuning experiments: for each batch size, tune the network once per rule in
# $ablated_rules, always disabling RuleCrossThreadReduction together with that rule.
# The output of every run is captured in its own log file under $log_dir.

#ablated_rules="RuleAddCacheRead RuleSpecialComputeLocationGPU RuleAlwaysInline RuleSimplifyComputeWithConstTensor RuleCrossThreadReduction RuleAddCacheWrite RuleMultiLevelTilingWithFusion RuleMultiLevelTiling InitFillTileSize InitThreadBind InitUnroll MutateTileSize MutateAutoUnroll"
ablated_rules="RuleAddCacheWrite RuleMultiLevelTilingWithFusion InitFillTileSize"

log_dir=./log/0620-pair
result_dir=./result/0620-pair

# Print the current timestamp. POSIX TZ offsets are sign-inverted, so "UTC-8" denotes a
# zone eight hours ahead of UTC.
cur_time() {
  TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"
}

# The output redirections below fail (and the tuning command is skipped) when the log
# directory does not exist, so create it up front.
mkdir -p "$log_dir"

#echo "Default Tuning with batch_size=64 at: $(cur_time)"
#python -u tune_network_cuda.py -b 64 -d 7 -n 300 --tuned_dir ./result/0615-bs64 > ./log/0615-bs64/default.log 2>&1

for bs in 16 64; do
  echo "Begin ablating rules with bs=${bs} at: $(cur_time)"
  # $ablated_rules is intentionally unquoted: it is a space-separated list of rule names.
  for rule in $ablated_rules; do
    log_file="${log_dir}/bs${bs}-disable-RuleCrossThreadReduction-${rule}.log"
    echo "Start test at:$(cur_time), rule: $rule, log: $log_file"
    python -u tune_network_cuda.py -b "$bs" -d 7 -n 300 --tuned_dir "$result_dir" -e RuleCrossThreadReduction -e "$rule" > "$log_file" 2>&1
    echo "End at: $(cur_time)"
  done
done
200 changes: 200 additions & 0 deletions workspace/ablate_sketch_rule/apply_tuned.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Auto-scheduling a Neural Network for NVIDIA GPU
===============================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_

Auto-tuning for specific devices and workloads is critical for getting the
best performance. This is a tutorial on how to tune a whole neural
network for NVIDIA GPU with the auto-scheduler.

To auto-tune a neural network, we partition the network into small subgraphs and
tune them independently. Each subgraph is treated as one search task.
A task scheduler slices the time and dynamically allocates time resources to
these tasks. The task scheduler predicts the impact of each task on the end-to-end
execution time and prioritizes the one that can reduce the execution time the most.

For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to
get the computational DAG in the tensor expression form.
We then use the auto-scheduler to construct a search space of this DAG and search
for good schedules (low-level optimizations).

Different from the template-based :ref:`autotvm <tutorials-autotvm-sec>` which relies on
manual templates to define the search space, the auto-scheduler does not require any
schedule templates. In other words, the auto-scheduler only uses the compute declarations
in :code:`tvm/python/topi` and does not use existing schedule templates.

Note that this tutorial will not run on Windows or recent versions of macOS. To
get it to run, you will need to wrap the body of this tutorial in a :code:`if
__name__ == "__main__":` block.
"""

import numpy as np

import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_executor
import argparse
import os

#################################################################
# Parse arguments

def parse_args(argv=None):
    """Parse the command-line options of this evaluation script.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. When None (the default), argparse reads
        them from ``sys.argv[1:]``.

    Returns
    -------
    args : argparse.Namespace
        Namespace with ``batch_size`` (int, default 16), ``device_id`` (int, default 7)
        and ``tuned_dir`` (str, default "./result").
    """
    # The first positional parameter of ArgumentParser is `prog`, not `description`,
    # so the text has to be passed by keyword to show up as the help description.
    parser = argparse.ArgumentParser(description="Evaluate tuned result")
    parser.add_argument(
        "-b",
        "--batch_size",
        type=int,
        default=16,
        help="batch size",
    )
    parser.add_argument(
        "-d",
        "--device_id",
        type=int,
        default=7,
        help="device id to be used",
    )
    parser.add_argument(
        "--tuned_dir",
        default="./result",
        help="dirname of tuned result stored",
    )
    return parser.parse_args(argv)

# Parse the command line once at import time; the resulting namespace is read by the
# module-level configuration and by apply_tuned() further down. Echo it for the log.
args = parse_args()
print("Arguments: %s" % args)

#################################################################
# Define a Network
# ----------------
# First, we need to define the network with relay frontend API.
# We can load some pre-defined network from :code:`tvm.relay.testing`.
# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow
# (see :ref:`front end tutorials<tutorial-frontend>`).
#
# For convolutional neural networks, although auto-scheduler can work correctly
# with any layout, we found the best performance is typically achieved with NHWC layout.
# We also implemented more optimizations for NHWC layout with the auto-scheduler.
# So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
# You can use :ref:`ConvertLayout <convert-layout-usage>` pass to do the layout conversion in TVM.


def get_network(name, batch_size, layout="NHWC", dtype="float32"):
    """Get the symbol definition and random weight of a network.

    Parameters
    ----------
    name : str
        Network to build: "resnet-<N>", "resnet3d-<N>", "mobilenet",
        "squeezenet_v1.1", "inception_v3" or "mxnet".
    batch_size : int
        Leading dimension of the input and output shapes.
    layout : str
        "NHWC" or "NCHW".
    dtype : str
        Data type forwarded to the workload builders.

    Returns
    -------
    mod, params, input_shape, output_shape
        The module and parameters returned by the selected builder, the input shape
        ``(batch_size,) + image_shape`` (overridden for inception_v3) and the output
        shape ``(batch_size, 1000)``.

    Raises
    ------
    ValueError
        If `layout` is neither "NHWC" nor "NCHW", or if `name` is not a known network.
    """

    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        # NOTE(review): this branch calls the same relay.testing.resnet builder as the
        # "resnet-" branch above -- confirm whether a 3-D ResNet builder was intended.
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        # inception_v3 uses 299x299 images, so the default 224x224 input shape is replaced.
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"

        block = get_model("resnet18_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        # Append a softmax to the imported network body and rebuild the module around it.
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    else:
        # Without this branch an unknown name would reach the return statement with
        # `mod` and `params` unbound and fail with an UnboundLocalError.
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape


# Define the neural network and compilation target.
# These module-level names (together with `args`) are read by apply_tuned() below;
# the batch size comes from the command line, everything else is fixed here.
network = "resnet-50"
batch_size = args.batch_size
layout = "NHWC"
target = tvm.target.Target("cuda")
dtype = "float32"

# Get network: the module and its parameters plus the input/output shapes.
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)

#################################################################
# Compile and Evaluate
# --------------------
# After auto-tuning, we can compile the network with the best schedules we found.
# All measurement records are dumped into the log file during auto-tuning,
# so we can read the log file and load the best schedules.

# Compile with the history best
def apply_tuned(log_file):
    """Build the network using the records in `log_file` and print a benchmark of it.

    Reads the module-level `mod`, `params`, `target`, `input_shape`, `dtype` and `args`.

    Parameters
    ----------
    log_file : str
        Path of the tuning record file handed to ``auto_scheduler.ApplyHistoryBest``.
    """
    print("Apply: %s" % log_file)

    # Build under both contexts: the history-best records and the auto-scheduler pass config.
    pass_config = {"relay.backend.use_auto_scheduler": True}
    with auto_scheduler.ApplyHistoryBest(log_file), tvm.transform.PassContext(
        opt_level=3, config=pass_config
    ):
        lib = relay.build(mod, target=target, params=params)

    # Create the graph executor on the requested device and feed it random input data
    device = tvm.device(str(target), args.device_id)
    executor = graph_executor.GraphModule(lib["default"](device))
    random_input = (np.random.uniform(size=input_shape)).astype(dtype)
    executor.set_input("data", tvm.nd.array(random_input))

    # Evaluate
    print(executor.benchmark(device, repeat=3, min_repeat_ms=500))

# Walk the tuned-result directory recursively and evaluate every file found in it.
for dirpath, _subdirs, filenames in os.walk(args.tuned_dir):
    for fname in filenames:
        apply_tuned(os.path.join(dirpath, fname))
11 changes: 11 additions & 0 deletions workspace/ablate_sketch_rule/evaluate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Runs print_best.py on one tuned record file, captures its output in $log_file and
# prints timestamps around the run.

# Print the current timestamp. POSIX TZ offsets are sign-inverted, so "UTC-8" denotes a
# zone eight hours ahead of UTC.
cur_time() {
  TZ=UTC-8 date +"%Y-%m-%d %H:%M:%S"
}

echo "Begin evaluating on: $(cur_time)"
#python -u no_schedule.py -b 16 -d 7
#python -u apply_tuned.py -b 64 -d 1 --tuned_dir ./result/0615-bs64
log_file=./log/bs64-default.debug
# The redirection below fails when the log directory is missing, so create it first.
mkdir -p "$(dirname "$log_file")"
python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-.json > "$log_file" 2>&1 &
#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-InitThreadBind.json > "$log_file" 2>&1 &
#python -u print_best.py -b 64 -d 1 --tuned_dir ./result/0615-bs64/resnet-50-NHWC-B64-cuda.disable-MutateAutoUnroll.json > "$log_file" 2>&1 &
# Wait for the background job so the end timestamp marks the end of the evaluation
# rather than the moment it was launched.
wait
echo "End at: $(cur_time)"
Loading