diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index f527746454..65d113d5b2 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -153,6 +153,9 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para # TODO: add namespace 'WriteTar': write_tar, } + + if 'use_bsp' in _: + config['IS_BSP'] = True return config diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 3106e1e10d..fefc5c14e3 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False): else: return f'{self.type.name} {self.name}{name_suffix}' - def declare_cpp(self, pipe_min_size=0, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + # Updated pipe min size to be 32 for simulation. + def declare_cpp(self, pipe_min_size=32, indent=''): + # Updated to use streaming beat for restartable streaming kernel. + # Streaming beat is a wrapper type of the actual type with sideband control signals. + # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat; + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<" + f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n" ) return lines @@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True): return f'{self.name}{name_suffix}' def declare_cpp(self, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {self.pragma[-1]}>;\n' + streaming_beat_t = f"{self.pipe_name}BeatT"; + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = " + f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n" ) return lines diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index e2b386d70d..7f85841110 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -38,12 +38,13 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP. if(NOT DEFINED FPGA_DEVICE) - set(FPGA_DEVICE "Arria10") + set(FPGA_DEVICE "Agilex7") endif() # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend # compilation. -set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) +# -Xsoptimize=latency Turns off the hyper-optimized handshake +set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 082ae5dc8c..d1713bb9ec 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -8,6 +8,129 @@ // currently this is fixed using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); +// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA. +// They are connected to the first and the last layer to stream data into and out from the kernel. +using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>, + sycl::ext::intel::experimental::bits_per_symbol<8>, + sycl::ext::intel::experimental::uses_valid, + sycl::ext::intel::experimental::first_symbol_in_high_order_bits, + sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready +)); + + +namespace nnet { + +#if !defined(IS_BSP) +// Definition for buffer locations for Avalon MM host. +inline constexpr unsigned kInputBufferLocation = 0; +inline constexpr unsigned kOutputBufferLocation = 1; +#endif + +// Implementation of a direct memory access kernel. Move data from source, convert, +// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow. +template +struct DMA_convert_data { +#if !defined(IS_BSP) + // When targeting a device family, we instantiate an Avalon Memory Mapped Host for + // data transaction between host and the DMA kernel during emulation and simulation. + sycl::ext::oneapi::experimental::annotated_arg, + sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_read, + sycl::ext::intel::experimental::wait_request_requested})> +#else + // When targeting oneAPI BSP, we can use USM pointer to access host memory. + src_T *const +#endif + src; + size_t num_iteration; + + [[intel::kernel_args_restrict]] + void operator()() const { + +#if defined(IS_BSP) + // Access data using host pointer. + sycl::ext::intel::host_ptr src_ptr(src); +#else + // Host allocation is not supported when targeting an FPGA family or part number. + src_T *src_ptr(src); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using DstDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] + typename nnet::ExtractPipeType::value_type packet; + + // Keep sending data to the input layer and keep the kernels running. + for (size_t i = 0; i < num_iteration; i++) { + #pragma unroll + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src_ptr[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + // Assert end-of-packet signal after the last iteration. + // All down-stream kernels will stop seeing eop. + packet.eop = (i == (num_iteration - 1)); + dest_pipe::write(packet); + } + } +}; + +// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and +// writes result to memory. +template +struct DMA_convert_data_back { +#if !defined(IS_BSP) + // Without BSP, instantiate an Avalon Memory Mapped Host to write to host. + sycl::ext::oneapi::experimental::annotated_arg, + sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_write, + sycl::ext::intel::experimental::wait_request_requested})> +#else + // USM pointer, otherwise. + dst_T *const +#endif + dst; + size_t num_iteration; + + [[intel::kernel_args_restrict]] + void operator()() const { +#if defined(IS_BSP) + sycl::ext::intel::host_ptr dst_ptr(dst); +#else + dst_T *dst_ptr(dst); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] + typename nnet::ExtractPipeType::value_type packet; + + // Drain the output pipe and write result to memory. + for (size_t i = 0; i < num_iteration; i++) { + packet = src_pipe::read(); + #pragma unroll + for (size_t j = 0; j < srcTypeSize; j++) { + dst_ptr[i * srcTypeSize + j] = static_cast(packet.data[j].to_double()); + } + } + } +}; + +} // namespace nnet + // Need to declare the input and output pipes // hls-fpga-machine-learning insert inputs diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 13de5ab3bb..ce94d59ddc 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -29,23 +29,33 @@ template void linear_stream // ************************************************* // ReLU Activation // ************************************************* -template void relu_stream() { +template +[[intel::use_stall_enable_clusters]] void relu_stream() { + using namespace nnet; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; ReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; + [[intel::initiation_interval(1)]] + while(keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + [[intel::fpga_register]] auto in_data = data_pipe::read(); +ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = 0; + } - ReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = 0; - } + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 92c9adc3bb..f5fbb4719d 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -7,15 +7,34 @@ namespace nnet { -// Note: DataPack logic removed, at least in the initial version +// Restartable streaming kernel implementation. +// Computation is carried out in a while-1 loop as long as there is valid input. +// The loop breaks when the end-of-packet signal is asserted by upstream task. template -void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { +[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights, const typename CONFIG_T::bias_t biases) { + using namespace nnet; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + + [[intel::fpga_register]] typename ExtractPipeType::value_type resbeat; - [[intel::fpga_register]] typename ExtractPipeType::value_type res; - [[intel::fpga_register]] auto data = data_pipe::read(); - dense_resource::value_type, typename ExtractPipeType::value_type, - CONFIG_T>(data, res, weights, biases); - res_pipe::write(res); + bool keep_going = true; + bool did_read_input; + [[intel::initiation_interval(1)]] + while (keep_going) { + did_read_input = false; + [[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input); + + if (did_read_input) { + dense_resource(databeat.data, resbeat.data, weights, biases); + + resbeat.sop = databeat.sop; + resbeat.eop = databeat.eop; + + res_pipe::write(resbeat); + keep_going = !databeat.eop; + } + } } } // namespace nnet diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 8cf883c1d5..6b908335cb 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -8,6 +8,8 @@ #include #include +#include // Streaming Beat and pipe properties. + namespace nnet { // Define the pipe type that we use @@ -34,6 +36,15 @@ struct ExtractPipeType struct ExtractDataType { typedef T value_type; }; + +// Specialization on oneAPI StreamingBeat type. +template +struct ExtractDataType> { + typedef DataT value_type; +}; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 82fb60d2f8..b695980388 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "firmware/myproject.h" @@ -20,13 +21,74 @@ #define CHECKPOINT 5000 +#if not defined(IS_BSP) +using sycl::ext::intel::experimental::property::usm::buffer_location; +#endif + +// Functions that reads input and prediction data from files. +// Returns `true` if files are read successfully and not empty. +// Returns `false` otherwise. +bool prepare_data_from_file( + std::string &fin_path, + std::string &fpr_path, + std::vector> &inputs, + std::vector> &predictions +) { + // load input data from text file + std::ifstream fin(fin_path.c_str()); + // load predictions from text file + std::ifstream fpr(fpr_path.c_str()); + + std::string iline; + std::string pline; + + if (fin.is_open() && fpr.is_open()) { + size_t num_iterations = 0; + + // Prepare input data from file. Load predictions from file. + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + } + fin.close(); + fpr.close(); + if (inputs.empty()) + return false; + else + return true; + } else { + return false; + } +} + int main(int argc, char **argv) { #if FPGA_SIMULATOR +#define NUM_ITERATIONS 5 auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE +#define NUM_ITERATIONS 100 auto selector = sycl::ext::intel::fpga_selector_v; #else // #if FPGA_EMULATOR +#define NUM_ITERATIONS 10 auto selector = sycl::ext::intel::fpga_emulator_selector_v; #endif @@ -44,93 +106,108 @@ int main(int argc, char **argv) { std::cout << "Running on device: " << device.get_info().c_str() << std::endl; - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - + std::string INPUT_FILE = "tb_data/tb_input_features.dat"; + std::string PRED_FILE = "tb_data/tb_output_predictions.dat"; std::string RESULTS_LOG = "tb_data/results.log"; std::ofstream fout(RESULTS_LOG); - std::string iline; - std::string pline; - - if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - unsigned int iteration = 0; - for (; std::getline(fin, iline) && std::getline(fpr, pline); iteration++) { - if (iteration % CHECKPOINT == 0) { - std::cout << "Processing input " << iteration << std::endl; - } + // Allocate vectors on stack to hold data from files temporarily. + std::vector> inputs; + std::vector> predictions; + bool file_valid = prepare_data_from_file(INPUT_FILE, PRED_FILE, inputs, predictions); + unsigned int num_iterations; + if (file_valid) { + num_iterations = inputs.size(); + } else { + num_iterations = NUM_ITERATIONS; + } - std::vector in; - std::vector pr; - float current; + // hls-fpga-machine-learning insert runtime contant - std::stringstream ssin(iline); - while (ssin >> current) { - in.push_back(current); - } + try { +#if defined(IS_BSP) + // Allocate host memory if BSP is in use. + float *vals = sycl::malloc_host(kInputSz, q); + if (vals == nullptr) { + std::cerr << "ERROR: host allocation failed for input\n"; + fout.close(); + return 1; + } + float *outputs = sycl::malloc_host(kOutputSz, q); + if (outputs == nullptr) { + std::cerr << "ERROR: host allocation failed for output\n"; + fout.close(); + return 1; + } +#else + float *vals = sycl::malloc_shared(kInputSz, q, sycl::property_list{buffer_location(nnet::kInputBufferLocation)}); + float *outputs = sycl::malloc_shared(kOutputSz, q, sycl::property_list{buffer_location(nnet::kOutputBufferLocation)}); +#endif - std::stringstream sspred(pline); - while (sspred >> current) { - pr.push_back(current); - } + if (file_valid) { + // Start always-run streaming kernel here, instead of inside a loop. + q.single_task(MyProject{}); // hls-fpga-machine-learning insert data - q.single_task(MyProject{}); - // hls-fpga-machine-learning convert output - std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); - - for (auto outval : outputs) { - fout << outval << " "; - } - fout << std::endl; - if (iteration % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - for (auto predval : pr) { - std::cout << predval << " "; + // Print output from kernel and from prediction file. + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + fout << outputs[i * kOutLayerSize + j] << " "; } - std::cout << std::endl; - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - for (auto outval : outputs) { - std::cout << outval << " "; + fout << std::endl; + if (i % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for (auto predval : predictions[i]) { + std::cout << predval << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; } - std::cout << std::endl; } - } - fin.close(); - fpr.close(); - } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations - << " invocations." << std::endl; - - // hls-fpga-machine-learning insert top-level-function - for (int i = 0; i < num_iterations; i++) { - // hls-fpga-machine-learning insert zero + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; q.single_task(MyProject{}); + // hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert zero // hls-fpga-machine-learning convert output - for (auto outval : outputs) { - std::cout << outval << " "; - } - std::cout << std::endl; - - for (auto outval : outputs) { - fout << outval << " "; + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + fout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; + fout << std::endl; } - fout << std::endl; } + sycl::free(vals, q); + sycl::free(outputs, q); + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + } catch (sycl::exception const &e) { + // Catches exceptions in the host code. + std::cerr << "Caught a SYCL host exception:\n" + << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.code().value() == CL_DEVICE_NOT_FOUND) + { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); } - q.wait(); - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - return 0; } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index e93f8b5ca3..ee8b530ca7 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -137,8 +137,14 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning read in' in line: newline = line if io_type == 'io_parallel': + restartable_kernel_loop = ( + f"bool keep_going = true;\n\n" + f"{indent}while (keep_going) {{\n" + ) + newline += indent + restartable_kernel_loop for inp in model_inputs: - newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name}_beat = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name} = {inp.name}_beat.data;\n' # for streaming we don't need to read it in # Insert weights @@ -151,16 +157,21 @@ def write_project_cpp(self, model): # Insert task sequences elif '// hls-fpga-machine-learning declare task sequences' in line: - newline = line if io_type == 'io_stream': # only need this for io_stream + newline = line for layer in model.get_layers(): ts = layer.get_attr('tast_sequence_cpp') if ts: newline += ' ' + ts + '\n' + else: + newline = indent + line # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: - newline = line + '\n' + if io_type == 'io_parallel': + newline = indent + line + '\n' + else: + newline = line + '\n' for layer in model.get_layers(): if io_type != 'io_stream': vars = layer.get_variables() @@ -168,14 +179,14 @@ def write_project_cpp(self, model): if var not in model_inputs: def_cpp = var.definition_cpp() if def_cpp is not None: - newline += ' ' + def_cpp + ';\n' + newline += indent * 2 + def_cpp + ';\n' func = ( layer.get_attr('function_cpp') if io_type == 'io_parallel' else layer.get_attr('stream_function_cpp') ) if func: - newline += ' ' + func + '\n' + newline += (indent * 2 if io_type == 'io_parallel' else indent) + func + '\n' if model.config.trace_output and layer.get_attr('trace', False): newline += '#ifndef HLS_SYNTHESIS\n' for var in vars: @@ -188,8 +199,15 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning return' in line: newline = line if io_type == 'io_parallel': + newline = indent + newline for out in model_outputs: - newline += indent + f'{out.pipe_name}::write({out.name});\n' + out_beat = f"{out.name}_beat" + newline += indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n' + newline += indent * 2 + f'{out_beat}.data = {out.name};\n' + newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n' + newline += indent * 2 + '// stops the kernel when the last input seen.\n' + newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n' + newline += f"{indent}}}\n" # don't need to add anything in io_stream # Just copy line @@ -396,27 +414,39 @@ def write_test_bench(self, model): newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert runtime contant' in line: + newline = line + insert_constant_lines = ( + f'{indent}const size_t kInputSz = {model_inputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kOutputSz = {model_outputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kInputLayerSize = {model_inputs[0].size_cpp()};\n' + f'{indent}const size_t kOutLayerSize = {model_outputs[0].size_cpp()};\n' + ) + newline += insert_constant_lines; elif '// hls-fpga-machine-learning insert zero' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = 0.0; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_zero_lines = ( + f'{indent}for (int j = 0 ; j < kInputSz; j++)\n' + f'{indent} vals[j] = 0.0;\n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_zero_lines elif '// hls-fpga-machine-learning insert data' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = in[j]; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_data_lines = ( + f'{indent}for (int i = 0; i < num_iterations; i++)\n' + f'{indent} for (int j = 0 ; j < kInputLayerSize; j++)\n' + f'{indent} vals[i * kInputLayerSize + j] = inputs[i][j]; \n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_data_lines elif '// hls-fpga-machine-learning convert output' in line: newline = line out = model_outputs[0] - newline += indent + f'float outputs[{out.size_cpp()}];\n' - newline += indent + f'nnet::convert_data_back<{out.pipe_name}, float, {out.size_cpp()}>(q, outputs);\n' + newline += \ + f'{indent}q.single_task(nnet::DMA_convert_data_back<{out.pipe_name}, float>{{outputs, num_iterations}}).wait();\n' else: newline = line