diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8995f9a87fb7..9568e95268c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -293,6 +293,19 @@ tvm_file_glob(GLOB CODEGEN_SRCS
 
 list(APPEND COMPILER_SRCS ${CODEGEN_SRCS})
 
+file(GLOB DMLC_SRC
+  3rdparty/dmlc-core/src/*.cc
+  3rdparty/dmlc-core/src/io/filesys.cc
+  3rdparty/dmlc-core/src/io/indexed_recordio_split.cc
+  3rdparty/dmlc-core/src/io/input_split_base.cc
+  3rdparty/dmlc-core/src/io/line_split.cc
+  3rdparty/dmlc-core/src/io/local_filesys.cc
+  3rdparty/dmlc-core/src/io/recordio_split.cc
+  )
+list(APPEND COMPILER_SRCS ${DMLC_SRC})
+
+
+
 tvm_file_glob(GLOB_RECURSE RELAY_OP_SRCS
   src/relay/op/*.cc
   )
diff --git a/include/tvm/auto_scheduler/measure.h b/include/tvm/auto_scheduler/measure.h
index 8576468816cb..03169cf4c47e 100755
--- a/include/tvm/auto_scheduler/measure.h
+++ b/include/tvm/auto_scheduler/measure.h
@@ -38,6 +38,8 @@
 #ifndef TVM_AUTO_SCHEDULER_MEASURE_H_
 #define TVM_AUTO_SCHEDULER_MEASURE_H_
 
+#include <tvm/runtime/ndarray.h>
+
 #include <tvm/auto_scheduler/loop_state.h>
 #include <tvm/auto_scheduler/search_task.h>
 
@@ -322,6 +324,10 @@ class ProgramRunnerNode : public Object {
   virtual Array<MeasureResult> Run(const Array<MeasureInput>& inputs,
                                    const Array<BuildResult>& build_results, int verbose) = 0;
 
+  /*! \brief Run the programs once and collect the contents of their output tensors. */
+  virtual Array<runtime::NDArray> GetOutput(const Array<MeasureInput>& inputs,
+                                            const Array<BuildResult>& build_results,
+                                            int verbose) = 0;
+
   static constexpr const char* _type_key = "auto_scheduler.ProgramRunner";
   TVM_DECLARE_BASE_OBJECT_INFO(ProgramRunnerNode, Object);
 };
@@ -373,6 +379,10 @@ class LocalRunnerNode : public ProgramRunnerNode {
   Array<MeasureResult> Run(const Array<MeasureInput>& inputs,
                            const Array<BuildResult>& build_results, int verbose) final;
 
+  Array<runtime::NDArray> GetOutput(const Array<MeasureInput>& inputs,
+                                    const Array<BuildResult>& build_results, int verbose) final;
+
+
   static constexpr const char* _type_key = "auto_scheduler.LocalRunner";
   TVM_DECLARE_FINAL_OBJECT_INFO(LocalRunnerNode, ProgramRunnerNode);
 };
@@ -422,6 +432,10 @@ class RPCRunnerNode : public ProgramRunnerNode {
   Array<MeasureResult> Run(const Array<MeasureInput>& inputs,
                            const Array<BuildResult>& build_results, int verbose) final;
 
+  Array<runtime::NDArray> GetOutput(const Array<MeasureInput>& inputs,
+                                    const Array<BuildResult>& build_results, int verbose) final;
+
+
   static constexpr const char* _type_key = "auto_scheduler.RPCRunner";
   TVM_DECLARE_FINAL_OBJECT_INFO(RPCRunnerNode, ProgramRunnerNode);
 };
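As an aside (not part of the patch), a minimal Python sketch of how the new GetOutput hook declared above is meant to be driven. It assumes an existing SearchTask named task and relies on the get_output method and the packed functions added further down in measure.py and measure.cc:

# Sketch only: exercise the new runner hook on an existing SearchTask `task`.
from tvm.auto_scheduler.measure import LocalBuilder, LocalRunner, MeasureInput

builder = LocalBuilder()
runner = LocalRunner()

inputs = [MeasureInput(task, task.compute_dag.get_init_state())]
build_results = builder.build(inputs, verbose=0)    # compile the initial schedule
outputs = runner.get_output(inputs, build_results)  # Array of tvm.nd.NDArray outputs
print(outputs[0].shape)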
diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h
index efbc2529592b..4f7f05440f01 100755
--- a/include/tvm/auto_scheduler/search_task.h
+++ b/include/tvm/auto_scheduler/search_task.h
@@ -125,6 +125,12 @@ class SearchTaskNode : public Object {
   LayoutRewriteOption layout_rewrite_option;
   /*! \brief Names of some user defined input data used in program measuring. */
   Array<String> task_input_names;
+  /*! \brief Custom seed used to reproduce randomly generated input values. */
+  int custom_seed;
+
+  /*! \brief Reference values for the output tensors. */
+  Array<runtime::NDArray> ref_output_tensors;
+
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("compute_dag", &compute_dag);
@@ -135,8 +141,14 @@ v->Visit("compute_dag", &compute_dag);
     v->Visit("hardware_params", &hardware_params);
     v->Visit("layout_rewrite_option", &layout_rewrite_option);
     v->Visit("task_input_names", &task_input_names);
+    v->Visit("custom_seed", &custom_seed);
+    v->Visit("ref_output_tensors", &ref_output_tensors);
   }
 
+  void SetReferenceTensors(Array<runtime::NDArray> arr);
+
+  void SetTarget(Target target, Target target_host);
+
   static constexpr const char* _type_key = "auto_scheduler.SearchTask";
   TVM_DECLARE_FINAL_OBJECT_INFO(SearchTaskNode, Object);
 };
@@ -160,7 +172,10 @@
   */
  SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host,
             Optional<HardwareParams> hardware_params, LayoutRewriteOption layout_rewrite_option,
-            Array<String> task_input_names, String desc = "");
+            Array<String> task_input_names,
+            String desc = "",
+            Array<runtime::NDArray> ref_output_tensors = Array<runtime::NDArray>(),
+            int custom_seed = 42);
 
   TVM_DEFINE_OBJECT_REF_METHODS(SearchTask, ObjectRef, SearchTaskNode);
 };
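For illustration only, a sketch of how the two new SearchTaskNode fields look from Python once the matching constructor changes (search_task.py, further down) are in place. "matmul_add" is a hypothetical workload assumed to be registered with @auto_scheduler.register_workload:

# Sketch only: the two new SearchTask fields as seen from Python.
import tvm
from tvm import auto_scheduler

task = auto_scheduler.SearchTask(
    func="matmul_add",
    args=(128, 128, 128, "float32"),
    target="llvm",
    ref_output_tensors=[tvm.nd.empty((128, 128), "float32")],  # placeholder reference
    custom_seed=42,  # forwarded to tvm.contrib.random.random_fill during measuring
)
print(task.custom_seed, len(task.ref_output_tensors))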
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 6f331499b042..37b24dec30be 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -46,6 +47,7 @@
 from tvm.ir import transform
 from tvm.runtime import Object, module, ndarray
 from tvm.target import Target
+import numpy as np
 
 from . import _ffi_api
 from .loop_state import StateObject
@@ -230,6 +231,7 @@ def recover_measure_input(inp, rebuild_state=False):
         hardware_params=task.hardware_params,
         layout_rewrite_option=task.layout_rewrite_option,
         task_inputs=list(task.task_input_names),
+        ref_output_tensors=task.ref_output_tensors,
     )
 
     if rebuild_state:
@@ -283,6 +285,10 @@
         """
         return _ffi_api.ProgramRunnerRun(self, measure_inputs, build_results, verbose)
 
+    def get_output(self, measure_inputs, build_results, verbose=1):
+        """Run the programs once and return their raw output tensors."""
+        return _ffi_api.ProgramRunnerGetOutput(self, measure_inputs, build_results, verbose)
+
 
 @tvm._ffi.register_object("auto_scheduler.ProgramMeasurer")
 class ProgramMeasurer(Object):
@@ -630,7 +636,7 @@ def _local_build_worker(inp_serialized, build_func, verbose):
 
     try:
         with transform.PassContext().current():
-            func = build_module.build(sch, args, target=task.target)
+            func = build_module.build(sch, args, target=task.target, target_host=task.target_host)
         func.export_library(filename, build_func)
     # pylint: disable=broad-except
     except Exception:
@@ -920,7 +926,7 @@ def _timed_eval_func(
                     empty_array = ndarray.empty(
                         get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev
                     )
-                    random_fill(empty_array)
+                    random_fill(empty_array, inp.task.custom_seed)
                     loc_args.append(empty_array)
                 else:
                     loc_args.append(ndarray.array(args[idx], dev))
@@ -944,6 +950,90 @@
     return costs, error_no, error_msg, toc - tic + build_res.time_cost, toc
 
 
+def _get_output_func(
+    inp_serialized,
+    build_res,
+    args,
+    number,
+    repeat,
+    min_repeat_ms,
+    cooldown_interval,
+    enable_cpu_cache_flush,
+    verbose,
+):
+    inp = MeasureInput.deserialize(inp_serialized)
+    tic = time.time()
+    error_no = 0
+    error_msg = None
+    try:
+        func = module.load_module(build_res.filename)
+        dev = ndarray.device(str(inp.task.target), 0)
+
+        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
+        time_f = func.time_evaluator(
+            func.entry_name,
+            dev,
+            number=number,
+            repeat=repeat,
+            min_repeat_ms=min_repeat_ms,
+            f_preproc=f_prepare,
+        )
+    # pylint: disable=broad-except
+    except Exception:
+        costs = (MAX_FLOAT,)
+        error_no = MeasureErrorNo.COMPILE_DEVICE
+        error_msg = make_traceback_info()
+
+    result = []
+    if error_no == 0:
+        try:
+            random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True)
+            assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake"
+            assert len(args) == len(build_res.args)
+
+            loc_args = []
+            # pylint: disable=consider-using-enumerate
+            for idx in range(len(args)):
+                if args[idx] is None:
+                    build_res_arg = build_res.args[idx]
+                    empty_array = ndarray.empty(
+                        get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev
+                    )
+                    random_fill(empty_array, inp.task.custom_seed)
+                    loc_args.append(empty_array)
+                else:
+                    loc_args.append(ndarray.array(args[idx], dev))
+            dev.sync()
+            costs = time_f(*loc_args).results
+
+            # the last argument holds the output tensor of the measured program
+            arr = ndarray.array(loc_args[-1], dev)
+            result.append(arr.numpy())
+        # pylint: disable=broad-except
+        except Exception:
+            costs = (MAX_FLOAT,)
+            error_no = MeasureErrorNo.RUNTIME_DEVICE
+            error_msg = make_traceback_info()
+
+    shutil.rmtree(os.path.dirname(build_res.filename))
+    toc = time.time()
+    time.sleep(cooldown_interval)
+
+    if verbose >= 1:
+        if error_no == MeasureErrorNo.NO_ERROR:
+            print("*", end="", flush=True)
+        else:
+            print("*E", end="", flush=True)  # Run error
+
+    return result
+
+
 @tvm._ffi.register_func("auto_scheduler.local_runner.run")
 def local_run(
     inputs,
@@ -1063,6 +1153,58 @@ def local_run(
     return measure_results
 
 
+@tvm._ffi.register_func("auto_scheduler.local_runner.get_output")
+def local_get_output(
+    inputs,
+    build_results,
+    timeout=10,
+    number=3,
+    repeat=1,
+    min_repeat_ms=0,
+    cooldown_interval=0,
+    enable_cpu_cache_flush=False,
+    verbose=1,
+):
+    """Run the programs once locally and collect their raw output tensors."""
+    measure_results = []
+    assert len(inputs) == len(build_results), "Measure input size should be equal to build results"
+    worker = PopenWorker()
+    for inp, build_res in zip(inputs, build_results):
+        if build_res.error_no != 0:
+            res = None
+            if verbose >= 1:
+                print("*B", end="", flush=True)  # Build error
+        else:
+            args = prepare_runner_args(inp, build_res)
+            res = call_func_with_timeout(
+                worker,
+                timeout,
+                _get_output_func,
+                args=(
+                    inp.serialize(),
+                    build_res,
+                    args,
+                    number,
+                    repeat,
+                    min_repeat_ms,
+                    cooldown_interval,
+                    enable_cpu_cache_flush,
+                    verbose,
+                ),
+            )
+        if isinstance(res, TimeoutError):
+            if verbose >= 1:
+                print("*T", end="", flush=True)  # Run timeout
+        elif isinstance(res, Exception):
+            if verbose >= 1:
+                print("*E", end="", flush=True)  # Run error
+        elif res is not None:
+            measure_results.extend(res)
+
+    if verbose >= 1:
+        print("", flush=True)
+
+    return [tvm.nd.array(x) for x in measure_results]
+
+
 def _rpc_run(
     inp_serialized,
@@ -1129,7 +1271,7 @@
                     empty_array = ndarray.empty(
                         get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev
                     )
-                    random_fill(empty_array)
+                    random_fill(empty_array, inp.task.custom_seed)
                     loc_args.append(empty_array)
                 else:
                     loc_args.append(ndarray.array(args[idx], dev))
@@ -1139,6 +1281,17 @@
                 func.entry_func(*loc_args)
             dev.sync()
 
+            # check the output against the reference values
+            arr = ndarray.array(loc_args[-1], dev).numpy()
+            ref = inp.task.ref_output_tensors[0].numpy()
+            diff = np.abs(arr - ref)
+            max_diff = np.amax(diff)
+            l2_diff = np.linalg.norm(diff)
+            if not (diff <= 1e-3).all():
+                print(
+                    "\nAccuracy verification: FAILED\n"
+                    f"maximum element difference: {max_diff}, l2 diff: {l2_diff}"
+                )
+                raise ValueError("Accuracy verification: FAILED")
+            print(
+                "\nAccuracy verification: PASSED\n"
+                f"maximum element difference: {max_diff}, l2 diff: {l2_diff}"
+            )
+
             costs = time_f(*loc_args).results
 
             # clean up remote files
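The accuracy gate added to _rpc_run above boils down to an element-wise tolerance test. A standalone illustration with made-up arrays (not part of the patch):

import numpy as np

arr = np.random.rand(64, 64).astype("float32")  # stands in for the measured output
ref = arr + 1e-4                                # stands in for task.ref_output_tensors[0]
diff = np.abs(arr - ref)
if not (diff <= 1e-3).all():
    raise ValueError("Accuracy verification: FAILED")
print("max element diff:", np.amax(diff), "l2 diff:", np.linalg.norm(diff))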
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 9541232a6a38..9a512f061247 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -144,6 +144,11 @@ def extract_tasks(
     # create search tasks
     tasks = []
     weights = []
+
+    # faked ref so that a freshly extracted task can be serialized
+    # TODO: without it, the initial serialization crashes; find a more elegant solution
+    ref = [tvm.nd.empty((1, 1))]
+
     for wkl_key, (weight, func_names) in env.wkl_key_to_weight.items():
         tasks.append(
             SearchTask(
@@ -160,6 +165,7 @@ def extract_tasks(
                 ),
                 task_inputs_save_to_file=True,
                 desc=",".join(func_names),
+                ref_output_tensors=ref,
             )
         )
         weights.append(int(weight))
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index ab03ff9f8eff..d1f488890654 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -436,6 +436,8 @@ def __init__(
         task_inputs_overwrite=False,
         task_inputs_save_to_file=False,
         desc="",
+        ref_output_tensors=None,
+        custom_seed=42,
     ):
         assert (
             func is not None or workload_key is not None
@@ -479,6 +481,8 @@ def __init__(
             layout_rewrite_option,
             task_input_names,
             desc,
+            ref_output_tensors,
+            custom_seed,
         )
 
     def tune(self, tuning_options, search_policy=None, adaptive_training=False):
@@ -598,6 +602,8 @@ def __setstate__(self, state):
             state["layout_rewrite_option"],
            state["task_input_names"],
             state["desc"],
+            state["ref_output_tensors"],
+            state["custom_seed"],
         )
 
 
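A hedged sketch of the intended end-to-end tuning flow, tying the changes together. tasks and task_weights are assumed to come from auto_scheduler.extract_tasks (touched in the relay_integration.py hunk above), and calc_ref_tensors is the TaskScheduler method added in task_scheduler.py below; this is an illustration, not part of the patch:

# Sketch only: tuning with reference collection enabled.
from tvm import auto_scheduler

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tuner.calc_ref_tensors()  # fill task.ref_output_tensors on a local "llvm" target
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=200,
    measure_callbacks=[auto_scheduler.RecordToFile("records.json")],
)
tuner.tune(tune_option)  # RPC measurements are now checked against the references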
diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index c23c9b3c0c2b..524058309829 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -32,10 +32,13 @@
 from .search_policy import SearchPolicy, SketchPolicy, PreloadMeasuredStates
 from .cost_model import RandomModel, XGBModel
 from .utils import array_mean
-from .measure import ProgramMeasurer
+from .measure import ProgramMeasurer, LocalBuilder, LocalRunner, MeasureInput, make_traceback_info
 from .measure_record import RecordReader
 from . import _ffi_api
+
+import tvm
+
 
 logger = logging.getLogger("auto_scheduler")
@@ -278,6 +281,47 @@
                 self.group_task_ids.append([])
             self.group_task_ids[self.tag_to_group_id[tag]].append(i)
 
+    def calc_ref_tensors(self):
+        """Compute reference output tensors for all tasks on a local llvm target."""
+        try:
+            local_builder = LocalBuilder()
+            local_runner = LocalRunner()
+
+            for idx, task in enumerate(self.tasks):
+                print(
+                    "========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)
+                )
+                print(task.compute_dag)
+
+                state = task.compute_dag.get_init_state()
+                original_target = task.target
+                original_target_host = task.target_host
+
+                # build and run the initial schedule on a local llvm target
+                ref_target = tvm.target.Target("llvm", host="llvm")
+                _ffi_api.SetTarget(task, ref_target, None)
+                measure_inputs = [MeasureInput(task, state)]
+
+                print("Collecting reference output tensors:")
+                # build in silent mode
+                build_results = local_builder.build(measure_inputs, verbose=0)
+
+                results = local_runner.get_output(measure_inputs, build_results)
+
+                _ffi_api.SetReferenceTensors(task, results)
+                _ffi_api.SetTarget(task, original_target, original_target_host)
+
+                print("DONE")
+
+        # pylint: disable=broad-except
+        except Exception:
+            error_msg = make_traceback_info()
+            print(error_msg)
+
+
     def tune(
         self,
         tune_option,
diff --git a/src/auto_scheduler/measure.cc b/src/auto_scheduler/measure.cc
index abb77581e7ee..a705262b1720 100755
--- a/src/auto_scheduler/measure.cc
+++ b/src/auto_scheduler/measure.cc
@@ -153,6 +153,27 @@ Array<MeasureResult> LocalRunnerNode::Run(const Array<MeasureInput>& inputs,
              << "make sure the TVM Python runtime has been loaded successfully.";
   throw;
 }
 
+Array<runtime::NDArray> LocalRunnerNode::GetOutput(const Array<MeasureInput>& inputs,
+                                                   const Array<BuildResult>& build_results,
+                                                   int verbose) {
+  if (const auto* f = runtime::Registry::Get("auto_scheduler.local_runner.get_output")) {
+    Array<runtime::NDArray> results =
+        (*f)(inputs, build_results, timeout, number, repeat, min_repeat_ms, cooldown_interval,
+             enable_cpu_cache_flush, verbose);
+    return results;
+  }
+  LOG(FATAL) << "auto_scheduler.local_runner.get_output is not registered. "
+             << "This is a function registered in Python, "
+             << "make sure the TVM Python runtime has been loaded successfully.";
+  throw;
+}
+
 /********** RPCRunner **********/
 RPCRunner::RPCRunner(const String& key, const String& host, int port, int priority, int n_parallel,
                      int timeout, int number, int repeat, int min_repeat_ms,
@@ -186,6 +207,16 @@ Array<MeasureResult> RPCRunnerNode::Run(const Array<MeasureInput>& inputs,
                << "make sure the TVM Python runtime has been loaded successfully.";
   }
   return Array<MeasureResult>();
+
+}
+
+Array<runtime::NDArray> RPCRunnerNode::GetOutput(const Array<MeasureInput>& inputs,
+                                                 const Array<BuildResult>& build_results,
+                                                 int verbose) {
+  LOG(FATAL) << "auto_scheduler.rpc_runner.get_output is not registered/implemented. "
+             << "This is a function registered in Python, "
+             << "make sure the TVM Python runtime has been loaded successfully.";
+  throw;
 }
 
 /********** MeasureCallback **********/
@@ -404,6 +435,15 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ProgramRunnerRun")
     .set_body_typed([](const ProgramRunner& runner, const Array<MeasureInput>& inputs,
                        const Array<BuildResult>& build_results,
                        int verbose) { return runner->Run(inputs, build_results, verbose); });
 
+TVM_REGISTER_GLOBAL("auto_scheduler.ProgramRunnerGetOutput")
+    .set_body_typed([](const ProgramRunner& runner, const Array<MeasureInput>& inputs,
+                       const Array<BuildResult>& build_results, int verbose) {
+      return runner->GetOutput(inputs, build_results, verbose);
+    });
+
 TVM_REGISTER_GLOBAL("auto_scheduler.LocalBuilder")
     .set_body_typed([](int timeout, int n_parallel, const String& build_func) {
       return LocalBuilder(timeout, n_parallel, build_func);
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index af37443d91e2..d9ec392948c5 100644
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -28,6 +28,9 @@
 #include <tvm/auto_scheduler/utils.h>
 #include <tvm/runtime/registry.h>
 
+#include <dmlc/io.h>
+
+
 #include <algorithm>
 #include <string>
 #include <utility>
@@ -172,6 +175,23 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
       writer->WriteArrayItem(std::string(""));
     }
     writer->WriteArrayItem(static_cast<int>(data.layout_rewrite_option));
+
+    // DEELVIN: serialize custom_seed and the reference output tensors.
+    // The tensors go to a binary side file; only its name is stored in the JSON record.
+    writer->WriteArrayItem(static_cast<int>(data.custom_seed));
+    std::string serialized_tensors_file = "tmp.stream";
+
+    dmlc::Stream* fs = dmlc::Stream::Create(serialized_tensors_file.c_str(), "wb");
+
+    writer->WriteArrayItem(serialized_tensors_file);
+
+    for (const auto& i : data.ref_output_tensors) {
+      i.Save(fs);
+    }
+
+    delete fs;
+
     writer->WriteArraySeperator();
     writer->BeginArray(false);
     for (const auto& i : data.task_input_names) {
@@ -209,6 +229,24 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
     ICHECK(s);
     reader->Read(&int_value);
     data->layout_rewrite_option = ::tvm::auto_scheduler::LayoutRewriteOption(int_value);
+
+    // DEELVIN: deserialize custom_seed and the reference output tensors.
+    s = reader->NextArrayItem();
+    ICHECK(s);
+    reader->Read(&int_value);
+    data->custom_seed = int_value;
+
+    s = reader->NextArrayItem();
+    ICHECK(s);
+    reader->Read(&str_value);
+
+    dmlc::Stream* fs = dmlc::Stream::Create(str_value.c_str(), "rb");
+
+    tvm::runtime::NDArray i;
+    i.Load(fs);
+    data->ref_output_tensors.push_back(i);
+
+    delete fs;
+
     s = reader->NextArrayItem();
     if (s) {
       reader->BeginArray();
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 262340099cc7..e9cad7897b6b 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -160,10 +160,35 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target,
   return HardwareParams();
 }
 
+TVM_REGISTER_GLOBAL("auto_scheduler.SetReferenceTensors")
+    .set_body_typed([](SearchTask task, Array<runtime::NDArray> arr) {
+      auto task_t = const_cast<SearchTaskNode*>(task.as<SearchTaskNode>());
+      task_t->SetReferenceTensors(arr);
+    });
+
+TVM_REGISTER_GLOBAL("auto_scheduler.SetTarget")
+    .set_body_typed([](SearchTask task, Target target, Target target_host) {
+      auto task_t = const_cast<SearchTaskNode*>(task.as<SearchTaskNode>());
+      task_t->SetTarget(target, target_host);
+    });
+
+void SearchTaskNode::SetReferenceTensors(Array<runtime::NDArray> arr) {
+  ref_output_tensors = std::move(arr);
+}
+
+void SearchTaskNode::SetTarget(Target _target, Target _target_host) {
+  target = std::move(_target);
+  target_host = std::move(_target_host);
+}
+
 SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
                        LayoutRewriteOption layout_rewrite_option, Array<String> task_input_names,
-                       String desc) {
+                       String desc, Array<runtime::NDArray> ref_output_tensors, int custom_seed) {
   CheckAndUpdateHostConsistency(&target, &target_host);
   auto node = make_object<SearchTaskNode>();
   node->compute_dag = std::move(compute_dag);
@@ -179,6 +204,10 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target,
   }
   node->layout_rewrite_option = layout_rewrite_option;
   node->task_input_names = std::move(task_input_names);
+
+  node->custom_seed = custom_seed;
+  node->ref_output_tensors = ref_output_tensors;
+
   data_ = std::move(node);
 }
 
@@ -199,10 +228,13 @@ TVM_REGISTER_GLOBAL("auto_scheduler.GetDefaultHardwareParams")
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask")
     .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
-                       int layout_rewrite_option, Array<String> task_input_names, String desc) {
+                       int layout_rewrite_option, Array<String> task_input_names,
+                       String desc, Array<runtime::NDArray> ref_output_tensors,
+                       int custom_seed) {
       CheckAndUpdateHostConsistency(&target, &target_host);
       return SearchTask(compute_dag, workload_key, target, target_host, hardware_params,
-                        LayoutRewriteOption(layout_rewrite_option), task_input_names, desc);
+                        LayoutRewriteOption(layout_rewrite_option), task_input_names,
+                        desc, ref_output_tensors, custom_seed);
     });
 
 }  // namespace auto_scheduler
diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc
index 38c2de6555e9..12027b2fd964 100644
--- a/src/runtime/contrib/random/random.cc
+++ b/src/runtime/contrib/random/random.cc
@@ -121,6 +121,10 @@ TVM_REGISTER_GLOBAL("tvm.contrib.random.normal").set_body([](TVMArgs args, TVMRetValue* ret) {
 
 TVM_REGISTER_GLOBAL("tvm.contrib.random.random_fill").set_body([](TVMArgs args, TVMRetValue* ret) {
   RandomThreadLocalEntry* entry = RandomThreadLocalEntry::ThreadLocal();
   DLTensor* out = args[0];
+  // seed the engine for reproducibility when a seed argument is provided
+  if (args.num_args > 1) {
+    int seed = args[1];
+    entry->random_engine.Seed(seed);
+  }
   entry->random_engine.RandomFill(out);
 });
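Finally, a small reproducibility check for the seeded random_fill added above (illustration only; it assumes TVM was built with USE_RANDOM ON and the patched two-argument packed function):

# Sketch only: the same seed should produce identical tensors.
import numpy as np
import tvm

random_fill = tvm.get_global_func("tvm.contrib.random.random_fill")
a = tvm.nd.empty((2, 3), "float32")
b = tvm.nd.empty((2, 3), "float32")
random_fill(a, 42)
random_fill(b, 42)
np.testing.assert_array_equal(a.numpy(), b.numpy())  # same seed, identical values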