diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index f527746454..65d113d5b2 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -153,6 +153,9 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para
             # TODO:  add namespace
             'WriteTar': write_tar,
         }
+        
+        if 'use_bsp' in _:
+            config['IS_BSP'] = True
 
         return config
 
diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py
index 3106e1e10d..fefc5c14e3 100644
--- a/hls4ml/backends/oneapi/oneapi_types.py
+++ b/hls4ml/backends/oneapi/oneapi_types.py
@@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False):
         else:
             return f'{self.type.name} {self.name}{name_suffix}'
 
-    def declare_cpp(self, pipe_min_size=0, indent=''):
-        lines = indent + f'class {self.pipe_id};\n'
-        lines += indent + (
-            f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
-            + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n'
+    # Updated pipe min size to be 32 for simulation.
+    def declare_cpp(self, pipe_min_size=32, indent=''):
+        # Updated to use streaming beat for restartable streaming kernel.
+        # Streaming beat is a wrapper type of the actual type with sideband control signals.
+        # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat<DataT, eop, empty>;
+        streaming_beat_t = f"{self.pipe_name}BeatT"
+        lines = (
+            f"{indent}class {self.pipe_id};\n"
+            f"{indent}using {streaming_beat_t} = "
+            f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
+            f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<"
+            f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n"
         )
         return lines
 
@@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True):
         return f'{self.name}{name_suffix}'
 
     def declare_cpp(self, indent=''):
-        lines = indent + f'class {self.pipe_id};\n'
-        lines += indent + (
-            f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
-            + f'{self.type.name}, {self.pragma[-1]}>;\n'
+        streaming_beat_t = f"{self.pipe_name}BeatT";
+        lines = (
+            f"{indent}class {self.pipe_id};\n"
+            f"{indent}using {streaming_beat_t} = "
+            f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
+            f"{indent}using {self.pipe_name} = "
+            f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n"
         )
         return lines
 
diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt
index e2b386d70d..7f85841110 100644
--- a/hls4ml/templates/oneapi/CMakeLists.txt
+++ b/hls4ml/templates/oneapi/CMakeLists.txt
@@ -38,12 +38,13 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
 # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
 # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
 if(NOT DEFINED FPGA_DEVICE)
-    set(FPGA_DEVICE "Arria10")
+    set(FPGA_DEVICE "Agilex7")
 endif()
 
 # Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
 # compilation.
-set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS})
+# -Xsoptimize=latency Turns off the hyper-optimized handshake
+set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)
 
 # Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
 set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 082ae5dc8c..d1713bb9ec 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -8,6 +8,129 @@
 // currently this is fixed
 using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>));
 
+// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA.
+// They are connected to the first and the last layer to stream data into and out from the kernel.
+using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties(
+    sycl::ext::intel::experimental::ready_latency<0>,
+    sycl::ext::intel::experimental::bits_per_symbol<8>,
+    sycl::ext::intel::experimental::uses_valid<true>,
+    sycl::ext::intel::experimental::first_symbol_in_high_order_bits<true>,
+    sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready
+));
+
+
+namespace nnet {
+
+#if !defined(IS_BSP)
+// Definition for buffer locations for Avalon MM host.
+inline constexpr unsigned kInputBufferLocation = 0;
+inline constexpr unsigned kOutputBufferLocation = 1;
+#endif
+
+// Implementation of a direct memory access kernel. Move data from source, convert, 
+// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow.
+template <class src_T, class dest_pipe> 
+struct DMA_convert_data {
+#if !defined(IS_BSP)
+    // When targeting a device family, we instantiate an Avalon Memory Mapped Host for 
+    // data transaction between host and the DMA kernel during emulation and simulation.
+    sycl::ext::oneapi::experimental::annotated_arg<src_T *, 
+      decltype(sycl::ext::oneapi::experimental::properties{
+          sycl::ext::intel::experimental::latency<0>,
+          sycl::ext::intel::experimental::dwidth<16>,
+          sycl::ext::intel::experimental::buffer_location<kInputBufferLocation>,
+          sycl::ext::intel::experimental::read_write_mode_read,
+          sycl::ext::intel::experimental::wait_request_requested})>
+#else
+    // When targeting oneAPI BSP, we can use USM pointer to access host memory.
+    src_T *const
+#endif
+        src;
+    size_t num_iteration;
+
+    [[intel::kernel_args_restrict]]
+    void operator()() const {
+        
+#if defined(IS_BSP)
+        // Access data using host pointer.
+        sycl::ext::intel::host_ptr<src_T> src_ptr(src);
+#else
+        // Host allocation is not supported when targeting an FPGA family or part number.
+        src_T *src_ptr(src);
+#endif
+        // First, extract the PipeDataT from the pipe
+        using PipeDataType = typename nnet::ExtractPipeType<dest_pipe>::value_type;
+        // Then, extract the DataT from StreamingBeat
+        using DstDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
+        constexpr auto dstTypeSize = std::tuple_size<DstDataType>{};
+
+        [[intel::fpga_register]]
+        typename nnet::ExtractPipeType<dest_pipe>::value_type packet;
+
+        // Keep sending data to the input layer and keep the kernels running.
+        for (size_t i = 0; i < num_iteration; i++) {
+            #pragma unroll
+            for (size_t j = 0; j < dstTypeSize; j++) {
+                packet.data[j] = src_ptr[i * dstTypeSize + j];
+            }
+            packet.sop = (i == 0);
+            // Assert end-of-packet signal after the last iteration.
+            // All down-stream kernels will stop seeing eop.
+            packet.eop = (i == (num_iteration - 1));
+            dest_pipe::write(packet);
+        }
+    }
+};
+
+// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and 
+// writes result to memory.
+template <class src_pipe, class dst_T> 
+struct DMA_convert_data_back {
+#if !defined(IS_BSP)
+    // Without BSP, instantiate an Avalon Memory Mapped Host to write to host.
+    sycl::ext::oneapi::experimental::annotated_arg<dst_T *, 
+      decltype(sycl::ext::oneapi::experimental::properties{
+          sycl::ext::intel::experimental::latency<0>,
+          sycl::ext::intel::experimental::dwidth<16>,
+          sycl::ext::intel::experimental::buffer_location<kOutputBufferLocation>,
+          sycl::ext::intel::experimental::read_write_mode_write,
+          sycl::ext::intel::experimental::wait_request_requested})>
+#else
+    // USM pointer, otherwise.
+    dst_T *const
+#endif
+        dst;
+    size_t num_iteration;
+
+    [[intel::kernel_args_restrict]]
+    void operator()() const {
+#if defined(IS_BSP)
+        sycl::ext::intel::host_ptr<dst_T> dst_ptr(dst);
+#else
+        dst_T *dst_ptr(dst);
+#endif
+        // First, extract the PipeDataT from the pipe
+        using PipeDataType = typename nnet::ExtractPipeType<src_pipe>::value_type;
+        // Then, extract the DataT from StreamingBeat
+        using SrcDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
+        constexpr auto srcTypeSize = std::tuple_size<SrcDataType>{};
+
+        [[intel::fpga_register]] 
+        typename nnet::ExtractPipeType<src_pipe>::value_type packet;
+        
+        // Drain the output pipe and write result to memory.
+        for (size_t i = 0; i < num_iteration; i++) {
+            packet = src_pipe::read();
+            #pragma unroll
+            for (size_t j = 0; j < srcTypeSize; j++) {
+                dst_ptr[i * srcTypeSize + j] = static_cast<dst_T>(packet.data[j].to_double());
+            }
+        }
+    }
+};
+
+}   // namespace nnet
+
 // Need to declare the input and output pipes
 
 // hls-fpga-machine-learning insert inputs
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
index 13de5ab3bb..ce94d59ddc 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
@@ -29,23 +29,33 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void linear_stream
 // *************************************************
 //       ReLU Activation
 // *************************************************
-template <class data_pipe, class res_pipe, typename CONFIG_T> void relu_stream() {
+template <class data_pipe, class res_pipe, typename CONFIG_T> 
+[[intel::use_stall_enable_clusters]] void relu_stream() {
+    using namespace nnet;
+    using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
+    
+    bool keep_going = true;
 ReLUActLoop:
-    [[intel::initiation_interval(
-        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; i++) {
-        auto in_data = data_pipe::read();
-        typename ExtractPipeType<res_pipe>::value_type out_data;
+    [[intel::initiation_interval(1)]]
+    while(keep_going) {
+        for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<ResT>{}; i++) {
+            [[intel::fpga_register]] auto in_data = data_pipe::read();
+ReLUPackLoop:
+            #pragma unroll
+            for (int j = 0; j < std::tuple_size<ResT>{}; j++) {
+                if (in_data.data[j] > 0)
+                    out_data.data[j] = in_data.data[j];
+                else
+                    out_data.data[j] = 0;
+            }
 
-    ReLUPackLoop:
-        #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
-            if (in_data[j] > 0)
-                out_data[j] = in_data[j];
-            else
-                out_data[j] = 0;
-        }
+            out_data.sop = in_data.sop;
+            out_data.eop = in_data.eop;
+            res_pipe::write(out_data);
 
-        res_pipe::write(out_data);
+            keep_going = !in_data.eop;
+        }
     }
 }
 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
index 92c9adc3bb..f5fbb4719d 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -7,15 +7,34 @@
 
 namespace nnet {
 
-// Note:  DataPack logic removed, at least in the initial version
+// Restartable streaming kernel implementation.
+// Computation is carried out in a while-1 loop as long as there is valid input.
+// The loop breaks when the end-of-packet signal is asserted by upstream task.
 template <class data_pipe, class res_pipe, typename CONFIG_T>
-void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
+[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights, const typename CONFIG_T::bias_t biases) {
+    using namespace nnet;
+    using DataT = typename ExtractDataType<typename ExtractPipeType<data_pipe>::value_type>::value_type;
+    using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;
+    
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type resbeat;
 
-    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
-    [[intel::fpga_register]] auto data = data_pipe::read();
-    dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
-                   CONFIG_T>(data, res, weights, biases);
-    res_pipe::write(res);
+    bool keep_going = true;
+    bool did_read_input;
+    [[intel::initiation_interval(1)]]
+    while (keep_going) {
+        did_read_input = false;
+        [[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input);
+
+        if (did_read_input) {
+            dense_resource<DataT, ResT, CONFIG_T>(databeat.data, resbeat.data, weights, biases);
+
+            resbeat.sop = databeat.sop;
+            resbeat.eop = databeat.eop;
+
+            res_pipe::write(resbeat);
+            keep_going = !databeat.eop;
+        }
+    }
 }
 
 } // namespace nnet
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
index 8cf883c1d5..6b908335cb 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
@@ -8,6 +8,8 @@
 #include <tuple>
 #include <utility>
 
+#include <sycl/ext/intel/prototype/pipes_ext.hpp>   // Streaming Beat and pipe properties.
+
 namespace nnet {
 
 // Define the pipe type that we use
@@ -34,6 +36,15 @@ struct ExtractPipeType<PipeClass<PipeName, PipeDataT, kPipeMinCapacity, PipeProp
     typedef PipeDataT value_type;
 };
 
+// Helper template for extracting datatype from oneAPI StreamingBeat type.
+template <typename T> struct ExtractDataType { typedef T value_type; };
+
+// Specialization on oneAPI StreamingBeat type.
+template <typename DataT, bool EnableSOP, bool EnableEmpty>
+struct ExtractDataType<sycl::ext::intel::experimental::StreamingBeat<DataT, EnableSOP, EnableEmpty>> {
+    typedef DataT value_type;
+};
+
 /*
  * HLS Shift Register Implementation
  * To verify a shift register is used in hardware, go to report.html > Area Analysis of System
diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp
index 82fb60d2f8..b695980388 100644
--- a/hls4ml/templates/oneapi/myproject_test.cpp
+++ b/hls4ml/templates/oneapi/myproject_test.cpp
@@ -4,6 +4,7 @@
 #include <fstream>
 #include <iostream>
 #include <string>
+#include <string_view>
 #include <vector>
 
 #include "firmware/myproject.h"
@@ -20,13 +21,74 @@
 
 #define CHECKPOINT 5000
 
+#if not defined(IS_BSP)
+using sycl::ext::intel::experimental::property::usm::buffer_location;
+#endif
+
+// Functions that reads input and prediction data from files.
+// Returns `true` if files are read successfully and not empty.
+// Returns `false` otherwise.
+bool prepare_data_from_file(
+    std::string &fin_path,
+    std::string &fpr_path,
+    std::vector<std::vector<float>> &inputs,
+    std::vector<std::vector<float>> &predictions
+) {
+    // load input data from text file
+    std::ifstream fin(fin_path.c_str());
+    // load predictions from text file
+    std::ifstream fpr(fpr_path.c_str());
+    
+    std::string iline;
+    std::string pline;
+
+    if (fin.is_open() && fpr.is_open()) {
+        size_t num_iterations = 0;
+        
+        // Prepare input data from file. Load predictions from file.
+        for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) {
+            if (num_iterations % CHECKPOINT == 0) {
+                std::cout << "Processing input " << num_iterations << std::endl;
+            }
+
+            std::vector<float> in;
+            std::vector<float> pr;
+            float current;
+
+            std::stringstream ssin(iline);
+            while (ssin >> current) {
+                in.push_back(current);
+            }
+
+            std::stringstream sspred(pline);
+            while (sspred >> current) {
+                pr.push_back(current);
+            }
+
+            std::copy(pr.cbegin(), pr.cend(), predictions.back().begin());
+            std::copy(in.cbegin(), in.cend(), inputs.back().begin());
+        }
+        fin.close();
+        fpr.close();
+        if (inputs.empty())
+            return false;
+        else
+            return true;
+    } else {
+        return false;
+    }
+}
+
 int main(int argc, char **argv) {
 
 #if FPGA_SIMULATOR
+#define NUM_ITERATIONS 5
     auto selector = sycl::ext::intel::fpga_simulator_selector_v;
 #elif FPGA_HARDWARE
+#define NUM_ITERATIONS 100
     auto selector = sycl::ext::intel::fpga_selector_v;
 #else // #if FPGA_EMULATOR
+#define NUM_ITERATIONS 10
     auto selector = sycl::ext::intel::fpga_emulator_selector_v;
 #endif
 
@@ -44,93 +106,108 @@ int main(int argc, char **argv) {
 
     std::cout << "Running on device: " << device.get_info<sycl::info::device::name>().c_str() << std::endl;
 
-    // load input data from text file
-    std::ifstream fin("tb_data/tb_input_features.dat");
-    // load predictions from text file
-    std::ifstream fpr("tb_data/tb_output_predictions.dat");
-
+    std::string INPUT_FILE  = "tb_data/tb_input_features.dat";
+    std::string PRED_FILE   = "tb_data/tb_output_predictions.dat";
     std::string RESULTS_LOG = "tb_data/results.log";
     std::ofstream fout(RESULTS_LOG);
 
-    std::string iline;
-    std::string pline;
-
-    if (fin.is_open() && fpr.is_open()) {
-        std::vector<std::vector<float>> predictions;
-        unsigned int iteration = 0;
-        for (; std::getline(fin, iline) && std::getline(fpr, pline); iteration++) {
-            if (iteration % CHECKPOINT == 0) {
-                std::cout << "Processing input " << iteration << std::endl;
-            }
+    // Allocate vectors on stack to hold data from files temporarily.
+    std::vector<std::vector<float>> inputs;
+    std::vector<std::vector<float>> predictions;
+    bool file_valid = prepare_data_from_file(INPUT_FILE, PRED_FILE, inputs, predictions);
+    unsigned int num_iterations;
+    if (file_valid) {
+        num_iterations = inputs.size();
+    } else {
+        num_iterations = NUM_ITERATIONS;
+    }
 
-            std::vector<float> in;
-            std::vector<float> pr;
-            float current;
+    // hls-fpga-machine-learning insert runtime contant
 
-            std::stringstream ssin(iline);
-            while (ssin >> current) {
-                in.push_back(current);
-            }
+    try {
+#if defined(IS_BSP)
+        // Allocate host memory if BSP is in use.
+        float *vals = sycl::malloc_host<float>(kInputSz, q);
+        if (vals == nullptr) {
+            std::cerr << "ERROR: host allocation failed for input\n";
+            fout.close();
+            return 1;
+        }
+        float *outputs = sycl::malloc_host<float>(kOutputSz, q);
+        if (outputs == nullptr) {
+            std::cerr << "ERROR: host allocation failed for output\n";
+            fout.close();
+            return 1;
+        }    
+#else
+        float *vals = sycl::malloc_shared<float>(kInputSz, q, sycl::property_list{buffer_location(nnet::kInputBufferLocation)});
+        float *outputs = sycl::malloc_shared<float>(kOutputSz, q, sycl::property_list{buffer_location(nnet::kOutputBufferLocation)});
+#endif
 
-            std::stringstream sspred(pline);
-            while (sspred >> current) {
-                pr.push_back(current);
-            }
+        if (file_valid) {
+            // Start always-run streaming kernel here, instead of inside a loop.
+            q.single_task(MyProject{});
 
             // hls-fpga-machine-learning insert data
 
-            q.single_task(MyProject{});
-
             // hls-fpga-machine-learning convert output
 
-            std::copy(pr.cbegin(), pr.cend(), predictions.back().begin());
-
-            for (auto outval : outputs) {
-                fout << outval << " ";
-            }
-            fout << std::endl;
-            if (iteration % CHECKPOINT == 0) {
-                std::cout << "Predictions" << std::endl;
-                // hls-fpga-machine-learning insert predictions
-                for (auto predval : pr) {
-                    std::cout << predval << " ";
+            // Print output from kernel and from prediction file.
+            for (int i = 0; i < num_iterations; i++) {
+                for (int j = 0; j < kOutLayerSize; j++) {
+                    fout << outputs[i * kOutLayerSize + j] << " ";
                 }
-                std::cout << std::endl;
-                std::cout << "Quantized predictions" << std::endl;
-                // hls-fpga-machine-learning insert quantized
-                for (auto outval : outputs) {
-                    std::cout << outval << " ";
+                fout << std::endl;
+                if (i % CHECKPOINT == 0) {
+                    std::cout << "Predictions" << std::endl;
+                    // hls-fpga-machine-learning insert predictions
+                    for (auto predval : predictions[i]) {
+                        std::cout << predval << " ";
+                    }
+                    std::cout << std::endl;
+                    std::cout << "Quantized predictions" << std::endl;
+                    // hls-fpga-machine-learning insert quantized
+                    for (int j = 0; j < kOutLayerSize; j++) {
+                        std::cout << outputs[i * kOutLayerSize + j] << " ";
+                    }
+                    std::cout << std::endl;
                 }
-                std::cout << std::endl;
             }
-        }
-        fin.close();
-        fpr.close();
-    } else {
-        const unsigned int num_iterations = 10;
-        std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
-                  << " invocations." << std::endl;
-
-        // hls-fpga-machine-learning insert top-level-function
-        for (int i = 0; i < num_iterations; i++) {
-            // hls-fpga-machine-learning insert zero
+        } else {
+            std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
+                    << " invocations." << std::endl;
             q.single_task(MyProject{});
+            // hls-fpga-machine-learning insert top-level-function
+            // hls-fpga-machine-learning insert zero
             // hls-fpga-machine-learning convert output
-            for (auto outval : outputs) {
-                std::cout << outval << " ";
-            }
-            std::cout << std::endl;
-
-            for (auto outval : outputs) {
-                fout << outval << " ";
+            for (int i = 0; i < num_iterations; i++) {
+                for (int j = 0; j < kOutLayerSize; j++) {
+                    std::cout << outputs[i * kOutLayerSize + j] << " ";
+                    fout << outputs[i * kOutLayerSize + j] << " ";
+                }
+                std::cout << std::endl;
+                fout << std::endl;
             }
-            fout << std::endl;
         }
+        sycl::free(vals, q);
+        sycl::free(outputs, q);
+        fout.close();
+        std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
+    } catch (sycl::exception const &e) {
+        // Catches exceptions in the host code.
+        std::cerr << "Caught a SYCL host exception:\n"
+                  << e.what() << "\n";
+
+        // Most likely the runtime couldn't find FPGA hardware!
+        if (e.code().value() == CL_DEVICE_NOT_FOUND)
+        {
+            std::cerr << "If you are targeting an FPGA, please ensure that your "
+                         "system has a correctly configured FPGA board.\n";
+            std::cerr << "Run sys_check in the oneAPI root directory to verify.\n";
+            std::cerr << "If you are targeting the FPGA emulator, compile with "
+                         "-DFPGA_EMULATOR.\n";
+        }
+        std::terminate();
     }
-    q.wait();
-
-    fout.close();
-    std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
-
     return 0;
 }
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index e93f8b5ca3..ee8b530ca7 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -137,8 +137,14 @@ def write_project_cpp(self, model):
                 elif '// hls-fpga-machine-learning read in' in line:
                     newline = line
                     if io_type == 'io_parallel':
+                        restartable_kernel_loop = (
+                            f"bool keep_going = true;\n\n"
+                            f"{indent}while (keep_going) {{\n"
+                        )
+                        newline += indent + restartable_kernel_loop
                         for inp in model_inputs:
-                            newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n'
+                            newline += indent * 2 + f'auto {inp.name}_beat = {inp.pipe_name}::read();\n'
+                            newline += indent * 2 + f'auto {inp.name} = {inp.name}_beat.data;\n'
                     # for streaming we don't need to read it in
 
                 # Insert weights
@@ -151,16 +157,21 @@ def write_project_cpp(self, model):
 
                 # Insert task sequences
                 elif '// hls-fpga-machine-learning declare task sequences' in line:
-                    newline = line
                     if io_type == 'io_stream':  # only need this for io_stream
+                        newline = line
                         for layer in model.get_layers():
                             ts = layer.get_attr('tast_sequence_cpp')
                             if ts:
                                 newline += '    ' + ts + '\n'
+                    else:
+                        newline = indent + line
 
                 # Neural net instantiation
                 elif '// hls-fpga-machine-learning insert layers' in line:
-                    newline = line + '\n'
+                    if io_type == 'io_parallel':
+                        newline = indent + line + '\n'
+                    else:
+                        newline = line + '\n'
                     for layer in model.get_layers():
                         if io_type != 'io_stream':
                             vars = layer.get_variables()
@@ -168,14 +179,14 @@ def write_project_cpp(self, model):
                                 if var not in model_inputs:
                                     def_cpp = var.definition_cpp()
                                     if def_cpp is not None:
-                                        newline += '    ' + def_cpp + ';\n'
+                                        newline += indent * 2 + def_cpp + ';\n'
                         func = (
                             layer.get_attr('function_cpp')
                             if io_type == 'io_parallel'
                             else layer.get_attr('stream_function_cpp')
                         )
                         if func:
-                            newline += '    ' + func + '\n'
+                            newline += (indent * 2 if io_type == 'io_parallel' else indent) + func + '\n'
                             if model.config.trace_output and layer.get_attr('trace', False):
                                 newline += '#ifndef HLS_SYNTHESIS\n'
                                 for var in vars:
@@ -188,8 +199,15 @@ def write_project_cpp(self, model):
                 elif '// hls-fpga-machine-learning return' in line:
                     newline = line
                     if io_type == 'io_parallel':
+                        newline = indent + newline
                         for out in model_outputs:
-                            newline += indent + f'{out.pipe_name}::write({out.name});\n'
+                            out_beat = f"{out.name}_beat"
+                            newline += indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n'
+                            newline += indent * 2 + f'{out_beat}.data = {out.name};\n'
+                            newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n'
+                        newline += indent * 2 + '// stops the kernel when the last input seen.\n'
+                        newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n'
+                        newline += f"{indent}}}\n"
                     # don't need to add anything in io_stream
 
                 # Just copy line
@@ -396,27 +414,39 @@ def write_test_bench(self, model):
                     newline = line
                     for bram in model_brams:
                         newline += f'#include \"firmware/weights/{bram.name}.h\"\n'
+                elif '// hls-fpga-machine-learning insert runtime contant' in line:
+                    newline = line
+                    insert_constant_lines = (
+                        f'{indent}const size_t kInputSz = {model_inputs[0].size_cpp()} * num_iterations;\n'
+                        f'{indent}const size_t kOutputSz = {model_outputs[0].size_cpp()} * num_iterations;\n'
+                        f'{indent}const size_t kInputLayerSize = {model_inputs[0].size_cpp()};\n'
+                        f'{indent}const size_t kOutLayerSize = {model_outputs[0].size_cpp()};\n'
+                    )
+                    newline += insert_constant_lines;
                 elif '// hls-fpga-machine-learning insert zero' in line:
                     newline = line
                     inp = model_inputs[0]
-                    newline += indent + f'float vals[{inp.size_cpp()}]; \n'
-                    newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n'
-                    newline += indent + '    vals[j] = 0.0; \n'
-                    newline += indent + '}\n'
-                    newline += indent + f'nnet::convert_data<float, {inp.pipe_name}, {inp.size_cpp()}>(q, vals);\n'
+                    insert_zero_lines = (
+                        f'{indent}for (int j = 0 ; j < kInputSz; j++)\n'
+                        f'{indent}    vals[j] = 0.0;\n'
+                        f'{indent}q.single_task(nnet::DMA_convert_data<float, {inp.pipe_name}>{{vals, num_iterations}});\n'
+                    )
+                    newline += insert_zero_lines
                 elif '// hls-fpga-machine-learning insert data' in line:
                     newline = line
                     inp = model_inputs[0]
-                    newline += indent + f'float vals[{inp.size_cpp()}]; \n'
-                    newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n'
-                    newline += indent + '    vals[j] = in[j]; \n'
-                    newline += indent + '}\n'
-                    newline += indent + f'nnet::convert_data<float, {inp.pipe_name}, {inp.size_cpp()}>(q, vals);\n'
+                    insert_data_lines = (
+                        f'{indent}for (int i = 0; i < num_iterations; i++)\n'
+                        f'{indent}    for (int j = 0 ; j < kInputLayerSize; j++)\n'
+                        f'{indent}        vals[i * kInputLayerSize + j] = inputs[i][j]; \n'
+                        f'{indent}q.single_task(nnet::DMA_convert_data<float, {inp.pipe_name}>{{vals, num_iterations}});\n'
+                    )
+                    newline += insert_data_lines
                 elif '// hls-fpga-machine-learning convert output' in line:
                     newline = line
                     out = model_outputs[0]
-                    newline += indent + f'float outputs[{out.size_cpp()}];\n'
-                    newline += indent + f'nnet::convert_data_back<{out.pipe_name}, float, {out.size_cpp()}>(q, outputs);\n'
+                    newline += \
+                        f'{indent}q.single_task(nnet::DMA_convert_data_back<{out.pipe_name}, float>{{outputs, num_iterations}}).wait();\n'
                 else:
                     newline = line