diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 90b0811..843a322 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -7,13 +7,25 @@ #include #include +#if defined(__CUDACC__) || defined(__HIP__) +#define YATETO_HOST __host__ +#define YATETO_DEVICE __device__ +#else +#define YATETO_HOST +#define YATETO_DEVICE +#endif + +#define YATETO_HOSTDEVICE YATETO_HOST YATETO_DEVICE + namespace yateto { template class slice { public: - explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) +#pragma omp declare target begin + YATETO_HOSTDEVICE explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) : start(start), stop(stop) {} +#pragma omp declare target end uint_t start; uint_t stop; @@ -27,24 +39,25 @@ namespace yateto { template class TensorView { public: - explicit TensorView(std::initializer_list shape) { +#pragma omp declare target + YATETO_HOSTDEVICE explicit TensorView(std::initializer_list shape) { std::copy(shape.begin(), shape.end(), m_shape); } - explicit TensorView(uint_t const shape[]) { + YATETO_HOSTDEVICE explicit TensorView(uint_t const shape[]) { for (uint_t d = 0; d < Dim; ++d) { m_shape[d] = shape[d]; } } - constexpr uint_t dim() const { + YATETO_HOSTDEVICE static constexpr uint_t dim() { return Dim; } - uint_t shape(uint_t dim) const { + YATETO_HOSTDEVICE uint_t shape(uint_t dim) const { return m_shape[dim]; } - +#pragma omp end declare target protected: uint_t m_shape[Dim]; }; @@ -52,36 +65,39 @@ namespace yateto { template class TensorView<0, real_t, uint_t> { public: - explicit TensorView(std::initializer_list shape) {} +#pragma omp declare target + YATETO_HOSTDEVICE explicit TensorView(std::initializer_list shape) {} - explicit TensorView(uint_t const shape[]) {} + YATETO_HOSTDEVICE explicit TensorView(uint_t const shape[]) {} - constexpr uint_t dim() const { + YATETO_HOSTDEVICE static constexpr uint_t dim() { return 0; } - uint_t shape(uint_t dim) const { + 
YATETO_HOSTDEVICE uint_t shape(uint_t dim) const { return 0; } +#pragma omp end declare target }; template class DenseTensorView : public TensorView { public: - explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) +#pragma omp declare target + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) : TensorView(shape), m_values(values) { std::copy(start.begin(), start.end(), m_start); std::copy(stop.begin(), stop.end(), m_stop); computeStride(); } - explicit DenseTensorView(real_t* values, std::initializer_list shape) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape) : TensorView(shape), m_values(values), m_start{} { std::copy(shape.begin(), shape.end(), m_stop); computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const start[], uint_t const stop[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const start[], uint_t const stop[]) : TensorView(shape), m_values(values) { for (uint_t d = 0; d < Dim; ++d) { m_start[d] = start[d]; @@ -90,7 +106,7 @@ namespace yateto { computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[]) : TensorView(shape), m_values(values), m_start{} { for (uint_t d = 0; d < Dim; ++d) { m_stop[d] = shape[d]; @@ -98,7 +114,7 @@ namespace yateto { computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const stride[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const stride[]) : TensorView(shape), m_values(values), m_start{} { for (uint_t d = 0; d < Dim; ++d) { m_stop[d] = shape[d]; @@ -106,11 +122,11 @@ namespace yateto { } } - uint_t size() const { + YATETO_HOSTDEVICE uint_t 
size() const { return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1]; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { uint_t entry[Dim]; std::copy(m_start, m_start + Dim, entry); while (entry[Dim-1] != m_stop[Dim-1]) { @@ -132,26 +148,26 @@ namespace yateto { } template - bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { + YATETO_HOSTDEVICE bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { return static_cast(head) >= start[dim] && static_cast(head) < stop[dim]; } template - bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { + YATETO_HOSTDEVICE bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { return static_cast(head) >= start[dim] && static_cast(head) < stop[dim] && isInRange(start, stop, dim+1, tail...); } template - bool isInRange(Entry... entry) const { + YATETO_HOSTDEVICE bool isInRange(Entry... entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to isInRange(...) does not match the tensor dimension."); return isInRange(m_start, m_stop, 0, entry...); } template - real_t& operator()(Entry... entry) { + YATETO_HOSTDEVICE real_t& operator()(Entry... entry) { static_assert(sizeof...(entry) == Dim, "Number of arguments to operator() does not match the tensor dimension."); assert(isInRange(entry...)); @@ -159,14 +175,14 @@ namespace yateto { } template - real_t operator()(Entry... entry) const { + YATETO_HOSTDEVICE real_t operator()(Entry... 
entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to operator() const does not match the tensor dimension."); assert(isInRange(entry...)); return m_values[address(entry...)]; } - real_t operator[](uint_t const entry[Dim]) const { + YATETO_HOSTDEVICE real_t operator[](uint_t const entry[Dim]) const { uint_t addr = 0; for (uint_t d = 0; d < Dim; ++d) { assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); @@ -175,7 +191,7 @@ namespace yateto { return m_values[addr]; } - real_t& operator[](uint_t const entry[Dim]) { + YATETO_HOSTDEVICE real_t& operator[](uint_t const entry[Dim]) { uint_t addr = 0; for (uint_t d = 0; d < Dim; ++d) { assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); @@ -185,7 +201,7 @@ namespace yateto { } template - void copyToView(view_t& other) const { + YATETO_HOSTDEVICE void copyToView(view_t& other) const { assert(Dim == other.dim()); uint_t entry[Dim]; @@ -217,7 +233,7 @@ namespace yateto { } template - auto subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t> const { + YATETO_HOSTDEVICE auto subtensor(Entry... 
entry) -> DenseTensorView::value, real_t, uint_t> const { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); constexpr auto nSlices = count_slices::value; uint_t begin[Dim]; @@ -228,16 +244,16 @@ namespace yateto { return subtensor; } - real_t* data() { + YATETO_HOSTDEVICE real_t* data() { return m_values; } - const real_t* data() const { + YATETO_HOSTDEVICE const real_t* data() const { return m_values; } protected: - void computeStride() { + YATETO_HOSTDEVICE void computeStride() { m_stride[0] = 1; for (uint_t d = 0; d < Dim-1; ++d) { m_stride[d+1] = m_stride[d] * (m_stop[d] - m_start[d]); @@ -245,26 +261,26 @@ namespace yateto { } template - uint_t address(Head head) const { + YATETO_HOSTDEVICE uint_t address(Head head) const { assert(static_cast(head) >= m_start[Dim-1] && static_cast(head) < m_stop[Dim-1]); return (head - m_start[Dim-1]) * m_stride[Dim-1]; } template - uint_t address(Head head, Tail... tail) const { + YATETO_HOSTDEVICE uint_t address(Head head, Tail... 
tail) const { uint_t const d = (Dim-1) - sizeof...(tail); assert(static_cast(head) >= m_start[d] && static_cast(head) < m_stop[d]); return (head - m_start[d]) * m_stride[d] + address(tail...); } template::value, int>::type = 0> - void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { + YATETO_HOSTDEVICE void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { assert(static_cast(entry) >= m_start[dimNo] && static_cast(entry) < m_stop[dimNo]); *begin++ = entry; } template>::value, int>::type = 0> - void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { + YATETO_HOSTDEVICE void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { *begin = std::max(m_start[dimNo], dim.start); *size++ = std::min(m_stop[dimNo], dim.stop) - *begin; ++begin; @@ -272,17 +288,18 @@ namespace yateto { } template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { + YATETO_HOSTDEVICE void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { extractDim(begin, size, stride, Dim-1, head); } template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... tail) const { + YATETO_HOSTDEVICE void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... 
tail) const { uint_t const d = (Dim-1) - sizeof...(tail); extractDim(begin, size, stride, d, head); extractSubtensor(begin, size, stride, tail...); } +#pragma omp end declare target real_t* m_values; uint_t m_start[Dim]; uint_t m_stop[Dim]; @@ -292,23 +309,24 @@ namespace yateto { template class DenseTensorView<0,real_t,uint_t> : public TensorView<0, real_t, uint_t> { public: - explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) +#pragma omp declare target + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) : TensorView<0, real_t, uint_t>(shape), m_values(values) { } - uint_t size() const { + YATETO_HOSTDEVICE uint_t size() const { return 1; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { m_values[0] = 0.0; } template - void copyToView(view_t& other) { + YATETO_HOSTDEVICE void copyToView(view_t& other) { other.m_values[0] = m_values[0]; } - +#pragma omp end declare target protected: real_t* m_values; }; @@ -316,23 +334,24 @@ namespace yateto { template class CSCMatrixView : public TensorView<2, real_t, uint_t> { public: - explicit CSCMatrixView(real_t* values, std::initializer_list shape, uint_t const* rowInd, uint_t const* colPtr) +#pragma omp declare target + YATETO_HOSTDEVICE explicit CSCMatrixView(real_t* values, std::initializer_list shape, uint_t const* rowInd, uint_t const* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { } - explicit CSCMatrixView(real_t* values, uint_t const shape[], uint_t const* rowInd, uint_t const* colPtr) + YATETO_HOSTDEVICE explicit CSCMatrixView(real_t* values, uint_t const shape[], uint_t const* rowInd, uint_t const* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { } - uint_t size() const { + YATETO_HOSTDEVICE uint_t size() const { return m_colPtr[ 
this->shape(1) ]; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { memset(m_values, 0, size() * sizeof(real_t)); } - real_t operator()(uint_t row, uint_t col) const { + YATETO_HOSTDEVICE real_t operator()(uint_t row, uint_t col) const { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -347,7 +366,7 @@ namespace yateto { return m_values[addr]; } - real_t& operator()(uint_t row, uint_t col) { + YATETO_HOSTDEVICE real_t& operator()(uint_t row, uint_t col) { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -362,7 +381,7 @@ namespace yateto { return m_values[addr]; } - bool isInRange(uint_t row, uint_t col) const { + YATETO_HOSTDEVICE bool isInRange(uint_t row, uint_t col) const { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -376,16 +395,16 @@ namespace yateto { return false; } - real_t& operator[](const uint_t entry[2]) { + YATETO_HOSTDEVICE real_t& operator[](const uint_t entry[2]) { return operator()(entry[0], entry[1]); } - real_t operator[](const uint_t entry[2]) const { + YATETO_HOSTDEVICE real_t operator[](const uint_t entry[2]) const { return operator()(entry[0], entry[1]); } template - void copyToView(view_t& other) { + YATETO_HOSTDEVICE void copyToView(view_t& other) { assert(2 == other.dim()); assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1)); @@ -399,7 +418,7 @@ namespace yateto { } } } - +#pragma omp end declare target protected: real_t* m_values; uint_t const* m_rowInd; diff --git a/yateto/arch.py b/yateto/arch.py index a19bc58..ffd3be5 100644 --- a/yateto/arch.py +++ b/yateto/arch.py @@ -111,7 +111,35 @@ def onHeap(self, numReals): def __eq__(self, other): return self.name == other.name - + + def indexing(self): + if self.backend in ['cuda', 'hip']: + return ('threadIdx.x', 'blockDim.x') + elif self.backend in ['omptarget']: + return 
('omp_get_thread_num()', 'omp_get_num_threads()') + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return ('item->get_local_id(0)', 'item->get_group().get_group_id(0)') + else: + raise NotImplementedError('Inline GPU kernel indexing not yet implemented for this type of backend') + + def barrier(self): + # TODO: maybe also grid-wise syncs? + if self.backend in ['cuda', 'hip']: + return '__syncthreads();' + elif self.backend in ['omptarget']: + return '#pragma omp barrier' + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return 'item->barrier(sycl::access::fence_space::local_space);' + else: + raise NotImplementedError('Inline GPU kernel barriers are not yet implemented for this type of backend') + + def headers(self): + if self.backend in ['cpp', 'cuda']: + return [] + elif self.backend in ['hip']: + return ['hip/hip_runtime.h'] + elif self.backend in ['omptarget']: + return ['omp.h'] + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return ['sycl/sycl.hpp'] + else: + return [] def _get_name_and_precision(ident): return ident[1:], ident[0].upper() diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py index e267494..2d5803c 100644 --- a/yateto/codegen/common.py +++ b/yateto/codegen/common.py @@ -50,7 +50,7 @@ def fromVar(cls, var, indices): is_const = var.tensor.is_compute_constant() return cls(str(var), indices, var.memoryLayout(), var.eqspp(), is_const, var.is_temporary) -def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None): +def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None, indexer=None): flops = 0 if indexNo == None: indexNo = len(indexNames)-1 @@ -59,10 +59,16 @@ def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo else: index = indexNames[indexNo] rng = ranges[index] + iterstart = rng.start + increment = 1 if pragmaSimd and indexNo == 0: - cpp('#pragma omp simd') - with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)): - flops = forLoops(cpp, indexNames, ranges,
body, pragmaSimd, prefix, indexNo-1) + if indexer is None: + cpp('#pragma omp simd') + else: + iterstart = f'{rng.start} + {indexer[0]}' + increment = f'{indexer[1]}' + with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; {3}{0} += {4}'.format(index, iterstart, rng.stop, prefix, increment)): + flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1, indexer) flops = flops * rng.size() return flops diff --git a/yateto/codegen/copyscaleadd/factory.py b/yateto/codegen/copyscaleadd/factory.py index dfbeafb..61f7b72 100644 --- a/yateto/codegen/copyscaleadd/factory.py +++ b/yateto/codegen/copyscaleadd/factory.py @@ -42,4 +42,4 @@ def generator(arch, descr, gemm_cfg, target): return CopyScaleAddGenerator(arch, descr) else: raise NotImplementedError(f'no implementation found for {target} target') - return Generic(arch, descr) + return Generic(arch, descr, target) diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py index 44bf704..91fa35b 100644 --- a/yateto/codegen/copyscaleadd/generic.py +++ b/yateto/codegen/copyscaleadd/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def _formatTerm(self, alpha, term): prefix = '' @@ -42,4 +43,5 @@ def __call__(s): return flop - return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody(), indexer=indexer) diff --git a/yateto/codegen/factory.py b/yateto/codegen/factory.py index b07ee8f..8ea3144 100644 --- a/yateto/codegen/factory.py +++ b/yateto/codegen/factory.py @@ -25,7 +25,7 @@ def generic_create(self, node, *args): def simple(self, result, term, add, scalar, routineCache, gemm_cfg): raise NotImplementedError - def temporary(self, bufname, size, iniZero=False, 
memory=list()): + def temporary(self, bufname, size, offset, iniZero=False, memory=list()): assert(iniZero == False or len(memory) == 0) if self._target == 'cpu': @@ -52,6 +52,13 @@ def temporary(self, bufname, size, iniZero=False, memory=list()): elif memory: ini = ' = {{{}}}'.format(', '.join(memory)) self._cpp(f'alignas({self._arch.alignment}) {self._arch.typename} {bufname}[{size}] {ini};') + elif self._target == 'igpu': + self._cpp(f'{self._arch.typename}* {bufname} = &sharedMemory[{offset}];') + if iniZero: + self._cpp.memset(bufname, size, self._arch.typename) + if memory: + for i, data in enumerate(memory): + self._cpp(f'{bufname}[{i}] = {data};') else: declaration = f'{self._arch.typename}* {bufname}' total_size = f'{BatchedOperationsAux.NUM_ELEMENTS_NAME} * {size}' @@ -59,7 +66,7 @@ def temporary(self, bufname, size, iniZero=False, memory=list()): def freeTmp(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: for free in self._freeList: self._cpp(f'free({free});') elif self._target == 'gpu': @@ -70,7 +77,7 @@ def freeTmp(self): self._freeList = [] def reset_stream(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: pass elif self._target == 'gpu': self._cpp(f'{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') @@ -78,7 +85,7 @@ def reset_stream(self): raise RuntimeError('unknown compute target') def reset_flags(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: pass elif self._target == 'gpu': self._cpp(f'{BatchedOperationsAux.FLAGS_NAME} = nullptr;') @@ -250,7 +257,7 @@ def tensor(self, node, resultName, maxValue = 512): spp = node.spp() isDense = spp.count_nonzero() == size if isDense: - self.temporary(resultName, size) + self.temporary(resultName, size, 0) with self._cpp.For('int i = 0; i < {}; ++i'.format(size)): self._cpp('{}[i] = static_cast<{}>((i + {}) % {} + 1);'.format(resultName, self._arch.typename, self._rand, maxValue)) else: @@ -259,7 +266,7 
@@ def tensor(self, node, resultName, maxValue = 512): for entry in zip(*nz): addr = ml.address(entry) memory[addr] = str(float((addr + self._rand) % maxValue)+1.0) - self.temporary(resultName, size, memory=memory) + self.temporary(resultName, size, 0, memory=memory) self._rand += 1 diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py index a2a98c0..1378622 100644 --- a/yateto/codegen/gemm/factory.py +++ b/yateto/codegen/gemm/factory.py @@ -88,4 +88,4 @@ def generator(arch, descr, gemm_cfg, target): target) if gemmTool: return GemmGen(arch, descr, gemmTool) - return Generic(arch, descr) + return Generic(arch, descr, target) diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py index 3b8537e..e2b1f06 100644 --- a/yateto/codegen/gemm/generic.py +++ b/yateto/codegen/gemm/generic.py @@ -4,8 +4,10 @@ class Generic(object): OUTER_INDEX = 'o' INNER_INDEX = 'i' - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): + self._arch = arch self._descr = descr + self._target = target def _flopInit(self, beta): return 0 if beta in [0.0, 1.0] else 1 @@ -77,7 +79,15 @@ def _generateSparseDense(self, cpp): sizes = {0: k.size(), 1: n.size(), self.OUTER_INDEX: m.size(), self.INNER_INDEX: n.size()} trans = d.transB - with cpp.For('int {0} = 0; {0} < {1}; ++{0}'.format(self.OUTER_INDEX, sizes[self.OUTER_INDEX])): + if self._target == 'igpu': + indexing = self._arch.indexing() + iterstart = indexing[0] + iterincr = indexing[1] + else: + iterstart = 0 + iterincr = 1 + + with cpp.For(f'int {self.OUTER_INDEX} = {iterstart}; {self.OUTER_INDEX} < {sizes[self.OUTER_INDEX]}; {self.OUTER_INDEX} += {iterincr}'): if d.beta != 1.0: with cpp.For('int {0} = 0; {0} < {1}; ++{0}'.format(self.INNER_INDEX, sizes[self.INNER_INDEX])): CAddr = result([self.INNER_INDEX, self.INNER_INDEX]) @@ -107,10 +117,18 @@ def _generateDenseDense(self, cpp): Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), False, d.transA) Baccess = 
self._accessFun(d.rightTerm, (k.start, n.start), False, d.transB) Caccess = self._accessFun(d.result, (m.start, n.start), False, False) + + if self._target == 'igpu': + indexing = self._arch.indexing() + mstart = indexing[0] + minc = indexing[1] + else: + mstart = 0 + minc = 1 with cpp.For('int n = 0; n < {0}; ++n'.format(n.size())): if d.beta != 1.0: - with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())): + with cpp.For(f'int m = {mstart}; m < {m.size()}; m += {minc}'): cpp('{} = {}{};'.format( Caccess('m', 'n'), d.beta, @@ -118,7 +136,7 @@ def _generateDenseDense(self, cpp): ) ) with cpp.For('int k = 0; k < {0}; ++k'.format(k.size())): - with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())): + with cpp.For(f'int m = {mstart}; m < {m.size()}; m += {minc}'): cpp( '{C} += {alpha} * {A} * {B};'.format( C = Caccess('m', 'n'), alpha = d.alpha, diff --git a/yateto/codegen/indexsum/factory.py b/yateto/codegen/indexsum/factory.py index 48e3423..2ef6311 100644 --- a/yateto/codegen/indexsum/factory.py +++ b/yateto/codegen/indexsum/factory.py @@ -21,7 +21,7 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, term: Ind def generator(arch, descr, target): - if target == 'cpu': + if target in ['cpu', 'igpu']: return Generic(arch, descr) elif target == 'gpu': raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures") \ No newline at end of file diff --git a/yateto/codegen/indexsum/generic.py b/yateto/codegen/indexsum/generic.py index 36df9f5..eae1f5c 100644 --- a/yateto/codegen/indexsum/generic.py +++ b/yateto/codegen/indexsum/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def generate(self, cpp, routineCache): d = self._descr @@ -27,4 +28,5 @@ def __call__(s): flop = 1 if d.alpha != 1.0 else 0 return d.sumLoopRange.size() + flop - return forLoops(cpp, 
d.result.indices, d.loopRanges, IndexSumBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, IndexSumBody(), indexer=indexer) diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py index 8a85366..5699d28 100644 --- a/yateto/codegen/product/factory.py +++ b/yateto/codegen/product/factory.py @@ -25,8 +25,8 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm: self.loopRanges = rA def generator(arch, descr, target): - if target == 'cpu': - return Generic(arch, descr) + if target in ['cpu', 'igpu']: + return Generic(arch, descr, target) elif target == 'gpu': raise RuntimeError("Product operation has not been implemented for GPU-like architectures") diff --git a/yateto/codegen/product/generic.py b/yateto/codegen/product/generic.py index 5cbba15..205c758 100644 --- a/yateto/codegen/product/generic.py +++ b/yateto/codegen/product/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def _mult(self, alpha): return '{} * '.format(alpha) if alpha != 1.0 else '' @@ -36,7 +37,8 @@ def __call__(s): ) return self._flop(d.add, d.alpha) - return forLoops(cpp, d.result.indices, d.loopRanges, ProductBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, ProductBody(), indexer=indexer) def _generateSparseDense(self, cpp): raise NotImplementedError diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index f0e67f5..49ea148 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -73,19 +73,42 @@ def generate(self, cpp, cfg, factory, routineCache, gemm_cfg): # NOTE: it is required to know in case if the memory is allocated on the heap # an provided by the user required_tmp_mem = 0 + 
memoffset = 0 cfg = DetermineLocalInitialization().visit(cfg) localPtrs = set() for pp in cfg: localPtrs.update(pp.bufferMap.keys()) if localPtrs: cpp( '{}{};'.format(self._arch.typename, ','.join(map(lambda x: ' *' + str(x), localPtrs))) ) + + if len(cfg) > 0: + last = cfg[-1] + first = cfg[0] + else: + first = None + last = None + + used = set() + + # TODO: rework synchronization for pp in cfg: for buf, size in pp.initBuffer.items(): required_tmp_mem += size * self._arch.bytesPerReal bufname = self._bufferName(buf) - factory.temporary(bufname, size) + factory.temporary(bufname, size, memoffset) + memoffset += size + + needsBarrier = False for local, buf in pp.bufferMap.items(): - cpp('{} = {};'.format(local, self._bufferName(buf))) + bufname = self._bufferName(buf) + cpp('{} = {};'.format(local, bufname)) + if bufname in used: + needsBarrier = True + used.add(bufname) + + if needsBarrier or (pp.action and pp.action.result.is_temporary and not first): + cpp(self._arch.barrier()) + action = pp.action if action: scalar = self.deduce_scalar(action) @@ -94,6 +117,9 @@ def generate(self, cpp, cfg, factory, routineCache, gemm_cfg): hwFlops += factory.create(action.term.node, action.result, action.term.variableList(), action.add, scalar, prefetchName, routineCache, gemm_cfg) else: hwFlops += factory.simple(action.result, action.term, action.add, scalar, routineCache, gemm_cfg) + + if action.result.is_temporary and pp is not last: + cpp(self._arch.barrier()) return hwFlops, required_tmp_mem class OptimisedKernelGenerator(KernelGenerator): @@ -324,6 +350,10 @@ def generate_extra_offset_args(base_name_with_namespace, groups): for base_name, groups in tensors.items(): generate_extra_offset_args(base_name, groups) + elif target == 'igpu': + header(f'{self._arch.typename}* sharedMemory = nullptr;') + if self._arch.backend in ['oneapi', 'acpp', 'hipsycl']: + header('sycl::nd_item<1>* item;') header.emptyline() if len(prefetch) > 0: @@ -332,10 +362,14 @@ def 
generate_extra_offset_args(base_name_with_namespace, groups): kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) header.emptyline() + + hostdevice = 'YATETO_DEVICE' if target == 'igpu' else '' for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline: - header.functionDeclaration(executeName(index)) + header('#pragma omp declare target') + header.functionDeclaration(executeName(index), returnType=f'{hostdevice} void') + header('#pragma omp end declare target') if familyStride is not None: header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) @@ -349,8 +383,14 @@ def generate_extra_offset_args(base_name_with_namespace, groups): indexF = indexFun(familyStride) with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) - with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): - header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) + with header.Function(self.EXECUTE_NAME, args, f'{hostdevice} {INLINE} void'): + if target == 'igpu': + header(f'const auto indexF = {indexF};') + for index, kernelOutline in enumerate(kernelOutlines): + with header.If(f'(indexF == {index})'): + header(f'{executeName(index)}();') + else: + header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] for function in aux_functions: @@ -379,28 +419,31 @@ def generate_extra_offset_args(base_name_with_namespace, groups): for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline is None: continue + + fileh = header if target == 'igpu' else cpp - with cpp.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index))): - for
base_name_with_namespace, groups in kernelOutline.scalars.items(): - base_name = Tensor.splitBasename(base_name_with_namespace)[-1] - if len(next(iter(groups))) > 0: - for gis in groups: - cpp('assert(!std::isnan({}({})));'.format(base_name, ','.join(str(gi) for gi in gis))) - else: - cpp(f'assert(!std::isnan({base_name}));') - for base_name_with_namespace, groups in kernelOutline.tensors.items(): - base_name = Tensor.splitBasename(base_name_with_namespace)[-1] - if len(next(iter(groups))) > 0: - for gis in groups: - cpp('assert({}({}) != nullptr);'.format(base_name, ','.join(str(gi) for gi in gis))) - else: - cpp(f'assert({base_name} != nullptr);') + with fileh.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index)), returnType=f'{hostdevice} void'): + if target != 'igpu': + for base_name_with_namespace, groups in kernelOutline.scalars.items(): + base_name = Tensor.splitBasename(base_name_with_namespace)[-1] + if len(next(iter(groups))) > 0: + for gis in groups: + fileh('assert(!std::isnan({}({})));'.format(base_name, ','.join(str(gi) for gi in gis))) + else: + fileh(f'assert(!std::isnan({base_name}));') + for base_name_with_namespace, groups in kernelOutline.tensors.items(): + base_name = Tensor.splitBasename(base_name_with_namespace)[-1] + if len(next(iter(groups))) > 0: + for gis in groups: + fileh('assert({}({}) != nullptr);'.format(base_name, ','.join(str(gi) for gi in gis))) + else: + fileh(f'assert({base_name} != nullptr);') - if target == 'gpu': - cpp(f'assert({BatchedOperationsAux.NUM_ELEMENTS_NAME} != 0);') - cpp(f'assert({BatchedOperationsAux.STREAM_PTR_NAME} != {BatchedOperationsAux.FORBIDDEN_STREAM_PTR});') + if target == 'gpu': + fileh(f'assert({BatchedOperationsAux.NUM_ELEMENTS_NAME} != 0);') + fileh(f'assert({BatchedOperationsAux.STREAM_PTR_NAME} != {BatchedOperationsAux.FORBIDDEN_STREAM_PTR});') - cpp(kernelOutline.function) + fileh(kernelOutline.function) class UnitTestGenerator(KernelGenerator): KERNEL_VAR = 'krnl' @@ -490,7 
+533,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, for var in variables: factory.tensor(var.tensor, self._tensorName(var)) - factory.temporary(self._name(var), var.memoryLayout().requiredReals(), iniZero=True) + factory.temporary(self._name(var), var.memoryLayout().requiredReals(), 0, iniZero=True) shape = var.memoryLayout().shape() cpp('{supportNS}::DenseTensorView<{dim},{arch.typename},{arch.uintTypename}> {viewName}({utName}, {{{shape}}}, {{{start}}}, {{{shape}}});'.format( @@ -535,7 +578,6 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, cpp( f'{self.QUEUE}.memcpy({self._devPtrTensorName(var)}, &{self._devTensorName(var)}, sizeof({self._arch.typename}*)).wait();' ) cpp.emptyline() - cpp( '{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, kernelClass, self.KERNEL_VAR) ) for var in scalars: cpp( '{}.{}{} = {};'.format(self.KERNEL_VAR, var.baseName(), self._groupIndex(var), self._tensorNameS(var)) ) @@ -746,10 +788,10 @@ def generateTensorsH(self, header): header('template') with header.Struct(self.CONTAINER_CLASS_NAME): header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) - header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) - with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): + header('YATETO_HOSTDEVICE {}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) + with header.Function('operator()', typedArgs, 'YATETO_HOSTDEVICE {} T&'.format(INLINE)): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): + with header.Function('operator()', typedArgs, 'YATETO_HOSTDEVICE {} T const&'.format(INLINE), const=True): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) for namespace, 
scalar_dict in self.iterate_collect_scalar(): with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): @@ -850,7 +892,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat tv = self._tensorViewGenerator(ml) with cpp.Struct(self.VIEW_STRUCT_NAME): cpp('typedef {} {};'.format(tv.typename(len(ml.shape()), self._arch), self.VIEW_TYPE_NAME)) - with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): + with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='YATETO_HOSTDEVICE {} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, None) else: typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) @@ -865,7 +907,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat cpp('template<>') with cpp.Struct('{}::{}<{}>'.format(baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): cpp('typedef {} {};'.format(typename, self.VIEW_TYPE_NAME)) - with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): + with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='YATETO_HOSTDEVICE {} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, index(group)) def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwaysArray=True, constexpr=True, static=True): diff --git a/yateto/generator.py b/yateto/generator.py index 49c2559..e5e1d12 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -24,7 +24,7 @@ class Kernel(object): BASE_NAME = r'[a-zA-Z]\w*' VALID_NAME = r'^{}$'.format(BASE_NAME) - VALID_TARGETS = ['cpu', 'gpu'] + VALID_TARGETS = ['cpu', 'gpu', 'igpu'] def __init__(self, name, ast, prefetch=None, namespace=None, target='cpu'): self.name = name @@ -367,6 +367,10 @@ def unit_test_body(cpp, testFramework): header.includeSys('limits') 
header.include('yateto.h') header.include(fTensors.hName) + + for path in self._arch.headers(): + header.includeSys(path) + cpp.include(fKernels.hName) with cpp.Namespace(namespace), header.Namespace(namespace): # Group kernels by namespace