diff --git a/Bender.lock b/Bender.lock
index 958e1ed..138e86b 100644
--- a/Bender.lock
+++ b/Bender.lock
@@ -71,7 +71,7 @@ packages:
       - common_verification
       - register_interface
   insitu-cache:
-    revision: 04f72a7ac7e9091f1820f0dac59bb778b134d7f7
+    revision: 57c0884166dd0f1b7c484633b437fe11d5d62c89
     version: null
     source:
       Git: https://github.com/pulp-platform/Insitu-Cache.git
diff --git a/config/cachepool_512.mk b/config/cachepool_512.mk
index 8970304..05992d5 100644
--- a/config/cachepool_512.mk
+++ b/config/cachepool_512.mk
@@ -52,7 +52,7 @@ l1d_num_way ?= 4
 l1d_tile_size ?= 256
 
 # L1 data cache tag width (TODO: should be calcualted)
-l1d_tag_data_width ?= 52
+l1d_tag_data_width ?= 92
 
 ####################
 ##  CachePool CC  ##
diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_512.mk
index c922ef3..c51eccd 100644
--- a/config/cachepool_fpu_512.mk
+++ b/config/cachepool_fpu_512.mk
@@ -54,7 +54,7 @@ l1d_num_way ?= 4
 l1d_tile_size ?= 256
 
 # L1 data cache tag width (TODO: should be calcualted)
-l1d_tag_data_width ?= 52
+l1d_tag_data_width ?= 92
 
 ####################
 ##  CachePool CC  ##
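Both config hunks bump the hand-maintained `l1d_tag_data_width` from 52 to 92, and the surrounding `TODO` notes the value should be calculated rather than hard-coded. Below is a minimal sketch of what such a derivation could look like, assuming one tag-bank entry packs tag plus status bits for every way of a set. The breakdown and all parameter values are illustrative assumptions; the real entry layout comes from the Insitu-Cache tag format that the submodule bump above changes.

```c
/* Sketch only: derive the tag-bank data width from the cache geometry.
 * The entry layout (all ways of a set side by side, each carrying the
 * physical tag plus a few status bits) is an assumption for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint32_t clog2(uint32_t x) { /* ceil(log2(x)) for x > 0 */
  uint32_t r = 0;
  while ((1u << r) < x)
    r++;
  return r;
}

static uint32_t tag_data_width(uint32_t addr_width, uint32_t num_set,
                               uint32_t line_bytes, uint32_t num_way,
                               uint32_t status_bits_per_way) {
  /* Tag = address bits left after removing set index and line offset. */
  uint32_t tag_bits = addr_width - clog2(num_set) - clog2(line_bytes);
  return num_way * (tag_bits + status_bits_per_way);
}

int main(void) {
  /* Made-up geometry that happens to land on 92: 4 ways x (21 + 2) bits.
   * The actual set count and status-bit layout must be read from the
   * Insitu-Cache sources before turning this into the Makefile formula. */
  printf("%u\n", tag_data_width(32, 32, 64, 4, 2));
  return 0;
}
```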
diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv
index 4a943a0..cbc5141 100644
--- a/hardware/src/cachepool_tile.sv
+++ b/hardware/src/cachepool_tile.sv
@@ -415,6 +415,7 @@ module cachepool_tile
   tcdm_user_t      [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_meta;
   logic            [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_write;
   data_t           [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_data;
+  strb_t           [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_strb;
 
   logic            [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_valid;
   logic            [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_ready;
@@ -433,7 +434,7 @@ module cachepool_tile
   logic            [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_we;
   tcdm_bank_addr_t [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_addr;
   data_t           [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_wdata;
-  logic            [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_be;
+  logic            [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0][DataWidth/8-1:0] l1_data_bank_be;
   data_t           [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_rdata;
   logic            [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_gnt;
 
@@ -630,6 +631,7 @@ module cachepool_tile
           assign cache_req_meta [cb][j] = cache_req_reg.q.user;
           assign cache_req_write[cb][j] = cache_req_reg.q.write;
           assign cache_req_data [cb][j] = cache_req_reg.q.data;
+          assign cache_req_strb [cb][j] = cache_req_reg.q.strb;
 
           assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j];
           assign cache_rsp_reg.q_ready = cache_req_ready[cb][j];
@@ -646,6 +648,7 @@ module cachepool_tile
           assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user;
           assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write;
           assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data;
+          assign cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb;
 
           assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j];
           assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j];
@@ -661,6 +664,21 @@ module cachepool_tile
     // For address scrambling
     localparam NumSelBits     = $clog2(NumL1CtrlTile);
     localparam NumWordPerLine = L1LineWidth / DataWidth;
+    localparam int unsigned WordBytes = DataWidth / 8;
+    initial begin
+      $display("Cache Configuration:");
+      $display("  NumCtrl           : %0d", NumL1CtrlTile);
+      $display("  LineWidth         : %0d", L1LineWidth);
+      $display("  NumWordPerLine    : %0d", NumWordPerLine);
+      $display("  NumSet            : %0d", L1NumSet);
+      $display("  AssoPerCtrl       : %0d", L1AssoPerCtrl);
+      $display("  BankFactor        : %0d", L1BankFactor);
+      $display("  NumTagBankPerCtrl : %0d", NumTagBankPerCtrl);
+      $display("  NumDataBankPerCtrl: %0d", NumDataBankPerCtrl);
+      $display("  CoalFactor        : %0d", L1CoalFactor);
+      $display("  RefillDataWidth   : %0d", RefillDataWidth);
+      $display("  DynamicOffset     : %0d", dynamic_offset);
+    end
     logic [SpatzAxiAddrWidth-1:0] bitmask_up, bitmask_lo;
     assign bitmask_lo = (1 << dynamic_offset) - 1;
     // We will keep AddrWidth - Offset - log2(CacheBanks) bits in the upper half, and add back the NumSelBits bits
@@ -679,6 +697,7 @@ module cachepool_tile
       .CoalExtFactor      (L1CoalFactor      ),
       .AddrWidth          (L1AddrWidth       ),
       .WordWidth          (DataWidth         ),
+      .ByteWidth          (8                 ),
       .TagWidth           (L1TagDataWidth    ),
       // Cache
       .NumCacheEntry      (L1NumEntryPerCtrl ),
@@ -712,6 +731,7 @@ module cachepool_tile
       .core_req_meta_i   (cache_req_meta [cb] ),
       .core_req_write_i  (cache_req_write[cb] ),
       .core_req_wdata_i  (cache_req_data [cb] ),
+      .core_req_wstrb_i  (cache_req_strb [cb] ),
       // Response
       .core_resp_valid_o (cache_rsp_valid[cb] ),
       .core_resp_ready_i (cache_rsp_ready[cb] ),
@@ -810,11 +830,19 @@ module cachepool_tile
     end
 
     // TODO: Should we use a single large bank or multiple narrow ones?
-    for (genvar j = 0; j < NumDataBankPerCtrl; j = j+NumWordPerLine) begin : gen_l1_data_banks
+    for (genvar bank = 0; bank < (NumDataBankPerCtrl/NumWordPerLine);
+         bank++) begin : gen_l1_data_banks
+      localparam int unsigned BaseIdx = bank * NumWordPerLine;
+      logic [NumWordPerLine*WordBytes-1:0] bank_be;
+
+      for (genvar w = 0; w < NumWordPerLine; w++) begin : gen_bank_be
+        assign bank_be[w*WordBytes +: WordBytes] = l1_data_bank_be[cb][BaseIdx + w];
+      end
+
       tc_sram_impl #(
         .NumWords  (L1CacheWayEntry/L1BankFactor),
         .DataWidth (L1LineWidth),
-        .ByteWidth (DataWidth  ),
+        .ByteWidth (8          ),
         .NumPorts  (1          ),
         .Latency   (1          ),
         .SimInit   ("zeros"    )
@@ -823,15 +851,15 @@ module cachepool_tile
         .clk_i  (clk_i       ),
         .rst_ni (rst_ni      ),
         .impl_i ('0          ),
         .impl_o (/* unsed */ ),
-        .req_i  ( l1_data_bank_req  [cb][j]                 ),
-        .we_i   ( l1_data_bank_we   [cb][j]                 ),
-        .addr_i ( l1_data_bank_addr [cb][j]                 ),
-        .wdata_i( l1_data_bank_wdata[cb][j+:NumWordPerLine] ),
-        .be_i   ( l1_data_bank_be   [cb][j+:NumWordPerLine] ),
-        .rdata_o( l1_data_bank_rdata[cb][j+:NumWordPerLine] )
+        .req_i  ( l1_data_bank_req  [cb][BaseIdx]                 ),
+        .we_i   ( l1_data_bank_we   [cb][BaseIdx]                 ),
+        .addr_i ( l1_data_bank_addr [cb][BaseIdx]                 ),
+        .wdata_i( l1_data_bank_wdata[cb][BaseIdx+:NumWordPerLine] ),
+        .be_i   ( bank_be                                         ),
+        .rdata_o( l1_data_bank_rdata[cb][BaseIdx+:NumWordPerLine] )
       );
 
-      assign l1_data_bank_gnt[cb][j+:NumWordPerLine] = {NumWordPerLine{1'b1}};
+      assign l1_data_bank_gnt[cb][BaseIdx+:NumWordPerLine] = {NumWordPerLine{1'b1}};
 
     end // for (genvar j = 0; j < NumDataBankPerCtrl; j++) begin : gen_l1_data_banks
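The tile changes above thread a per-word byte strobe from the core request into the cache (`cache_req_strb`, the new `core_req_wstrb_i` port, and a `ByteWidth` of 8 on both the cache and the data-bank SRAMs), and `gen_l1_data_banks` now concatenates the strobes of the `NumWordPerLine` words sharing one line-wide SRAM into a single `bank_be`. A small C model of that regrouping, assuming this config's `DataWidth = 64` and `L1LineWidth = 512`; word `w`'s strobes occupy byte lanes `w*WordBytes +: WordBytes`, exactly as in the generate loop:

```c
/* C model of the bank_be packing in gen_l1_data_banks (widths assumed from
 * the cachepool_512 config: 64-bit words, 512-bit lines, so 8 words per
 * line and 8 byte-enable bits per word). */
#include <stdint.h>
#include <stdio.h>

#define WORD_BYTES 8     /* DataWidth / 8           */
#define WORDS_PER_LINE 8 /* L1LineWidth / DataWidth */

/* All 64 byte-enable bits of one 512-bit SRAM row fit in a uint64_t. */
static uint64_t pack_bank_be(const uint8_t word_be[WORDS_PER_LINE]) {
  uint64_t bank_be = 0;
  for (int w = 0; w < WORDS_PER_LINE; w++)
    bank_be |= (uint64_t)word_be[w] << (w * WORD_BYTES); /* lanes w*8..w*8+7 */
  return bank_be;
}

int main(void) {
  uint8_t word_be[WORDS_PER_LINE] = {0};
  word_be[0] = 0x02; /* sb to byte 1 of word 0     */
  word_be[2] = 0x0C; /* sh to bytes 2..3 of word 2 */
  printf("bank_be = 0x%016llx\n", (unsigned long long)pack_bank_be(word_be));
  return 0; /* prints bank_be = 0x00000000000c0002 */
}
```

Dropping `ByteWidth` from `DataWidth` to 8 is what lets `tc_sram_impl` mask individual bytes, so a lone `sb` can update one byte of a 512-bit row without a read-modify-write.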
diff --git a/software/tests/CMakeLists.txt b/software/tests/CMakeLists.txt
index 786c4c5..54013c8 100644
--- a/software/tests/CMakeLists.txt
+++ b/software/tests/CMakeLists.txt
@@ -85,6 +85,7 @@ set(SNITCH_TEST_PREFIX cachepool-)
 ## RLC
 add_spatz_test_zeroParam(spin-lock spin-lock/main.c)
 add_spatz_test_zeroParam(mcs-lock mcs-lock/main.c)
+add_spatz_test_zeroParam(byte-enable byte-enable/main.c)
 
 # add_snitch_test(multi_producer_single_consumer_double_linked_list multi_producer_single_consumer_double_linked_list/main.c)
 # add_spatz_test_threeParam(multi_producer_single_consumer_double_linked_list multi_producer_single_consumer_double_linked_list/main.c 1 1350 1000)
diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c
new file mode 100644
index 0000000..e0e37b6
--- /dev/null
+++ b/software/tests/byte-enable/main.c
@@ -0,0 +1,455 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Zexin Fu
+
+#include <benchmark.h>
+#include <snrt.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "printf.h"
+#ifdef DATAHEADER
+#include DATAHEADER
+#endif
+
+#define L1LineWidth (512 / 8) // 512 bits
+#define BUF_LINES 2
+#define BUF_BYTES (L1LineWidth * BUF_LINES)
+
+#ifndef ENABLE_SCALAR_TESTS
+#define ENABLE_SCALAR_TESTS 1
+#endif
+
+#ifndef ENABLE_VECTOR_TESTS
+#define ENABLE_VECTOR_TESTS 1
+#endif
+
+#define VEC_E8_LEN 16U
+#define VEC_E16_LEN 8U
+#define VEC_BUF_BYTES 256U
+
+static uint8_t test_buf[BUF_BYTES] __attribute__((aligned(L1LineWidth)))
+    __attribute__((section(".data")));
+
+static uint8_t vec_buf[VEC_BUF_BYTES] __attribute__((aligned(64)))
+    __attribute__((section(".data")));
+static uint8_t vec_data8[VEC_E8_LEN] __attribute__((aligned(4)))
+    __attribute__((section(".data")));
+static uint16_t vec_data16[VEC_E16_LEN] __attribute__((aligned(4)))
+    __attribute__((section(".data")));
+static uint8_t vec_idx8[VEC_E8_LEN] __attribute__((aligned(4)))
+    __attribute__((section(".data")));
+static uint16_t vec_idx16[VEC_E16_LEN] __attribute__((aligned(4)))
+    __attribute__((section(".data")));
+
+static inline void store_b(void *addr, uint8_t value) {
+  asm volatile("sb %0, 0(%1)" :: "r"(value), "r"(addr) : "memory");
+}
+
+static inline void store_h(void *addr, uint16_t value) {
+  asm volatile("sh %0, 0(%1)" :: "r"(value), "r"(addr) : "memory");
+}
+
+static inline void store_w(void *addr, uint32_t value) {
+  asm volatile("sw %0, 0(%1)" :: "r"(value), "r"(addr) : "memory");
+}
+
+static inline int32_t load_b(const void *addr) {
+  int32_t out;
+  asm volatile("lb %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory");
+  return out;
+}
+
+static inline int32_t load_h(const void *addr) {
+  int32_t out;
+  asm volatile("lh %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory");
+  return out;
+}
+
+static inline int32_t load_w(const void *addr) {
+  int32_t out;
+  asm volatile("lw %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory");
+  return out;
+}
+
+static uint8_t pattern_byte(size_t idx) {
+  return (uint8_t)(0xA5U ^ (uint8_t)idx);
+}
+
+static void init_pattern(uint8_t *buf, size_t bytes) {
+  size_t words = bytes / 4U;
+  for (size_t w = 0; w < words; w++) {
+    size_t base = w * 4U;
+    uint32_t b0 = pattern_byte(base + 0U);
+    uint32_t b1 = pattern_byte(base + 1U);
+    uint32_t b2 = pattern_byte(base + 2U);
+    uint32_t b3 = pattern_byte(base + 3U);
+    uint32_t word = (b0) | (b1 << 8U) | (b2 << 16U) | (b3 << 24U);
+    store_w(buf + base, word);
+  }
+
+  for (size_t i = words * 4U; i < bytes; i++) {
+    buf[i] = pattern_byte(i);
+  }
+}
+
+static void init_vec_data(void) {
+  for (unsigned int i = 0; i < VEC_E8_LEN; i++) {
+    vec_data8[i] = (uint8_t)(0x10U + (i * 3U));
+    vec_idx8[i] = (uint8_t)(i * 3U);
+  }
+  for (unsigned int i = 0; i < VEC_E16_LEN; i++) {
+    vec_data16[i] = (uint16_t)(0x2000U + (i * 5U));
+    vec_idx16[i] = (uint16_t)(i * 4U);
+  }
+}
+
+static unsigned long long cycle_to_ns(size_t cycle) {
+  return (unsigned long long)cycle * 2ULL + 10ULL;
+}
+
+static void trace_inst(const char *name, const char *inst, const void *addr,
+                       size_t cycle) {
+  unsigned long long ns = cycle_to_ns(cycle);
+  printf("[TRACE] %s: %s @ 0x%08x cycle %u ns %llu\n", name, inst,
+         (unsigned int)(uintptr_t)addr, (unsigned int)cycle, ns);
+}
+
+static int check_store_and_load(const char *name, uint8_t *base,
+                                uint32_t offset, uint32_t size,
+                                uint32_t value) {
+  int errors = 0;
+
+  if (((uintptr_t)base & 0x3U) != 0U) {
+    printf("[FAIL] %s: base misaligned 0x%llx\n", name,
+           (unsigned long long)(uintptr_t)base);
+    return 1;
+  }
+
+  if ((offset + size) > 4U) {
+    printf("[FAIL] %s: offset+size out of range\n", name);
+    return 1;
+  }
+
+  uint32_t orig = (uint32_t)load_w(base);
+
+  const uint8_t *addr = base + offset;
+  const char *store_name = "s?";
+  size_t store_cycle = 0;
+
+  switch (size) {
+  case 1:
+    store_name = "sb";
+    store_cycle = benchmark_get_cycle();
+    store_b((void *)addr, (uint8_t)value);
+    break;
+  case 2:
+    store_name = "sh";
+    store_cycle = benchmark_get_cycle();
+    store_h((void *)addr, (uint16_t)value);
+    break;
+  case 4:
+    store_name = "sw";
+    store_cycle = benchmark_get_cycle();
+    store_w((void *)addr, (uint32_t)value);
+    break;
+  default:
+    printf("[FAIL] %s: invalid size %u\n", name, size);
+    return 1;
+  }
+
+  trace_inst(name, store_name, addr, store_cycle);
+
+  uint32_t after = (uint32_t)load_w(base);
+  uint32_t expected = orig;
+
+  uint32_t mask = (size == 1)   ? 0xFFU
+                  : (size == 2) ? 0xFFFFU
+                                : 0xFFFFFFFFU;
+  uint32_t shift = offset * 8U;
+  expected = (orig & ~(mask << shift)) | ((value & mask) << shift);
+
+  int store_ok = (after == expected);
+  if (!store_ok) {
+    printf("[FAIL] %s: store before 0x%08x expected 0x%08x got 0x%08x\n", name,
+           (unsigned int)orig, (unsigned int)expected, (unsigned int)after);
+    errors++;
+  }
+
+  int32_t load_got = 0;
+  int32_t load_exp = 0;
+  const char *load_name = "l?";
+  size_t load_cycle = 0;
+  int load_ok = 0;
+
+  if (size == 1) {
+    load_name = "lb";
+    load_cycle = benchmark_get_cycle();
+    load_got = load_b(addr);
+    load_exp = (int8_t)value;
+  } else if (size == 2) {
+    load_name = "lh";
+    load_cycle = benchmark_get_cycle();
+    load_got = load_h(addr);
+    load_exp = (int16_t)value;
+  } else if (size == 4) {
+    load_name = "lw";
+    load_cycle = benchmark_get_cycle();
+    load_got = load_w(addr);
+    load_exp = (int32_t)value;
+  }
+
+  trace_inst(name, load_name, addr, load_cycle);
+
+  load_ok = (load_got == load_exp);
+  if (!load_ok) {
+    printf("[FAIL] %s: %s before 0x%08x expected 0x%08x got 0x%08x\n", name,
+           load_name, (unsigned int)orig, (unsigned int)load_exp,
+           (unsigned int)load_got);
+    errors++;
+  }
+
+  if (store_ok) {
+    printf("[PASS] %s: store before 0x%08x expected 0x%08x got 0x%08x\n", name,
+           (unsigned int)orig, (unsigned int)expected, (unsigned int)after);
+  }
+  if (load_ok) {
+    printf("[PASS] %s: %s before 0x%08x expected 0x%08x got 0x%08x\n", name,
+           load_name, (unsigned int)orig, (unsigned int)load_exp,
+           (unsigned int)load_got);
+  }
+  return errors;
+}
+
+static int verify_vec_e8(const char *name, uint8_t *buf, size_t base_index,
+                         const uint8_t *vals, const uint16_t *offsets,
+                         size_t count, size_t region_bytes) {
+  int errors = 0;
+
+  for (size_t pos = 0; pos < region_bytes; pos++) {
+    uint8_t exp = pattern_byte(base_index + pos);
+    for (size_t i = 0; i < count; i++) {
+      if (offsets[i] == pos) {
+        exp = vals[i];
+        break;
+      }
+    }
+    uint8_t got = buf[base_index + pos];
+    if (got != exp) {
+      printf("[FAIL] %s: byte+%u expected 0x%02x got 0x%02x\n", name,
+             (unsigned int)pos, (unsigned int)exp, (unsigned int)got);
+      errors++;
+    }
+  }
+
+  if (errors == 0) {
+    printf("[PASS] %s: vector store verify\n", name);
+  }
+
+  return errors;
+}
+
+static int verify_vec_e16(const char *name, uint8_t *buf, size_t base_index,
+                          const uint16_t *vals, const uint16_t *offsets,
+                          size_t count, size_t region_bytes) {
+  int errors = 0;
+
+  for (size_t pos = 0; pos < region_bytes; pos++) {
+    uint8_t exp = pattern_byte(base_index + pos);
+    for (size_t i = 0; i < count; i++) {
+      if (offsets[i] == pos) {
+        exp = (uint8_t)(vals[i] & 0xFFU);
+        break;
+      } else if (offsets[i] + 1U == pos) {
+        exp = (uint8_t)((vals[i] >> 8U) & 0xFFU);
+        break;
+      }
+    }
+    uint8_t got = buf[base_index + pos];
+    if (got != exp) {
+      printf("[FAIL] %s: byte+%u expected 0x%02x got 0x%02x\n", name,
+             (unsigned int)pos, (unsigned int)exp, (unsigned int)got);
+      errors++;
+    }
+  }
+
+  if (errors == 0) {
+    printf("[PASS] %s: vector store verify\n", name);
+  }
+
+  return errors;
+}
+
+static int run_vector_tests(void) {
+  int errors = 0;
+  uint32_t avl;
+  uint32_t vlen;
+  uint16_t offsets16[VEC_E16_LEN];
+  uint16_t offsets8[VEC_E8_LEN];
+
+  init_pattern(vec_buf, VEC_BUF_BYTES);
+  init_vec_data();
+
+  const uint32_t stride_b = 2U;
+  const uint32_t stride_h = 4U;
+
+  const size_t base_e8_unit    = 0U;
+  const size_t base_e8_stride  = 64U;
+  const size_t base_e8_index   = 128U;
+  const size_t base_e16_unit   = 160U;
+  const size_t base_e16_stride = 192U;
+  const size_t base_e16_index  = 224U;
+
+  avl = VEC_E8_LEN;
+  asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle8.v v0, (%0)" :: "r"(vec_data8) : "memory");
+  size_t cyc = benchmark_get_cycle();
+  asm volatile("vse8.v v0, (%0)" :: "r"(vec_buf + base_e8_unit) : "memory");
+  trace_inst("vec e8 unit", "vse8.v", vec_buf + base_e8_unit, cyc);
+  for (unsigned int i = 0; i < VEC_E8_LEN; i++)
+    offsets8[i] = i;
+  errors += verify_vec_e8("vec e8 unit", vec_buf, base_e8_unit, vec_data8,
+                          offsets8, VEC_E8_LEN, VEC_E8_LEN);
+
+  avl = VEC_E8_LEN;
+  asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle8.v v0, (%0)" :: "r"(vec_data8) : "memory");
+  cyc = benchmark_get_cycle();
+  asm volatile("vsse8.v v0, (%0), %1" :: "r"(vec_buf + base_e8_stride),
+               "r"(stride_b) : "memory");
+  trace_inst("vec e8 strided", "vsse8.v", vec_buf + base_e8_stride, cyc);
+  for (unsigned int i = 0; i < VEC_E8_LEN; i++)
+    offsets8[i] = i * stride_b;
+  errors += verify_vec_e8("vec e8 strided", vec_buf, base_e8_stride, vec_data8,
+                          offsets8, VEC_E8_LEN,
+                          (VEC_E8_LEN - 1U) * stride_b + 1U);
+
+  avl = VEC_E8_LEN;
+  asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle8.v v0, (%0)" :: "r"(vec_data8) : "memory");
+  asm volatile("vle8.v v1, (%0)" :: "r"(vec_idx8) : "memory");
+  cyc = benchmark_get_cycle();
+  asm volatile("vsuxei8.v v0, (%0), v1" :: "r"(vec_buf + base_e8_index)
+               : "memory");
+  trace_inst("vec e8 indexed", "vsuxei8.v", vec_buf + base_e8_index, cyc);
+  for (unsigned int i = 0; i < VEC_E8_LEN; i++)
+    offsets8[i] = vec_idx8[i];
+  errors += verify_vec_e8("vec e8 indexed", vec_buf, base_e8_index, vec_data8,
+                          offsets8, VEC_E8_LEN,
+                          offsets8[VEC_E8_LEN - 1U] + 1U);
+
+  avl = VEC_E16_LEN;
+  asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory");
+  cyc = benchmark_get_cycle();
+  asm volatile("vse16.v v0, (%0)" :: "r"(vec_buf + base_e16_unit) : "memory");
+  trace_inst("vec e16 unit", "vse16.v", vec_buf + base_e16_unit, cyc);
+  for (unsigned int i = 0; i < VEC_E16_LEN; i++)
+    offsets16[i] = i * 2U;
+  errors += verify_vec_e16("vec e16 unit", vec_buf, base_e16_unit, vec_data16,
+                           offsets16, VEC_E16_LEN, VEC_E16_LEN * 2U);
+
+  avl = VEC_E16_LEN;
+  asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory");
+  cyc = benchmark_get_cycle();
+  asm volatile("vsse16.v v0, (%0), %1" :: "r"(vec_buf + base_e16_stride),
+               "r"(stride_h) : "memory");
+  trace_inst("vec e16 strided", "vsse16.v", vec_buf + base_e16_stride, cyc);
+  for (unsigned int i = 0; i < VEC_E16_LEN; i++)
+    offsets16[i] = i * stride_h;
+  errors += verify_vec_e16("vec e16 strided", vec_buf, base_e16_stride,
+                           vec_data16, offsets16, VEC_E16_LEN,
+                           (VEC_E16_LEN - 1U) * stride_h + 2U);
+
+  avl = VEC_E16_LEN;
+  asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl));
+  asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory");
+  asm volatile("vle16.v v1, (%0)" :: "r"(vec_idx16) : "memory");
+  cyc = benchmark_get_cycle();
+  asm volatile("vsuxei16.v v0, (%0), v1" :: "r"(vec_buf + base_e16_index)
+               : "memory");
+  trace_inst("vec e16 indexed", "vsuxei16.v", vec_buf + base_e16_index, cyc);
+  for (unsigned int i = 0; i < VEC_E16_LEN; i++)
+    offsets16[i] = vec_idx16[i];
+  errors += verify_vec_e16("vec e16 indexed", vec_buf, base_e16_index,
+                           vec_data16, offsets16, VEC_E16_LEN,
+                           offsets16[VEC_E16_LEN - 1U] + 2U);
+
+  (void)vlen;
+  return errors;
+}
+
+int main(void) {
+  const unsigned int core_id = snrt_cluster_core_idx();
+
+  if (core_id == 0) {
+    l1d_init(0);
+    uint32_t offset = 31U - __builtin_clz((unsigned int)L1LineWidth);
+    l1d_xbar_config(offset);
+  }
+
+  snrt_cluster_hw_barrier();
+
+  int errors = 0;
+
+  if (core_id == 0) {
+    init_pattern(test_buf, BUF_BYTES);
+    printf("original data (line order, high->low addr):\n");
+    for (unsigned int line = 0; line < BUF_LINES; line++) {
+      printf("line %u: ", line);
+      for (unsigned int byte = 0; byte < L1LineWidth; byte++) {
+        unsigned int idx = line * L1LineWidth + (L1LineWidth - 1U - byte);
+        printf("%02x ", (unsigned int)test_buf[idx]);
+      }
+      printf("\n");
+    }
+
+#if ENABLE_SCALAR_TESTS
+    errors += check_store_and_load("sb/lb line0+1", test_buf, 1, 1, 0x80U);
+
+    errors += check_store_and_load("sh/lh line0+4+2", test_buf + 4, 2, 2,
+                                   0x8001U);
+
+    errors += check_store_and_load("sw/lw line0+16+0", test_buf + 16, 0, 4,
+                                   0x80000005U);
+
+    errors += check_store_and_load("sb/lb line1+3", test_buf + L1LineWidth, 3,
+                                   1, 0x7FU);
+#endif
+
+#if ENABLE_VECTOR_TESTS
+    errors += run_vector_tests();
+#endif
+
+    if (errors == 0) {
+      printf("Byte-enable test PASSED\n");
+    } else {
+      printf("Byte-enable test FAILED: %d errors\n", errors);
+    }
+  }
+
+  snrt_cluster_hw_barrier();
+
+  if (core_id == 0) {
+    set_eoc();
+  }
+
+  return 0;
+}
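The test's scalar half drives `sb`/`sh`/`sw` at different offsets and checks that the untouched byte lanes keep the background pattern. The merge rule it verifies is worth stating on its own; the host-side reference model below mirrors the `expected` computation inside `check_store_and_load` (the example values are illustrative, not taken from the test):

```c
/* Reference model of what the byte-enable path must produce for a scalar
 * sub-word store: only `size` bytes at `offset` inside the aligned 32-bit
 * word may change; every other byte lane must keep its old value. */
#include <assert.h>
#include <stdint.h>

static uint32_t merge_store(uint32_t orig, uint32_t value,
                            uint32_t offset, uint32_t size) {
  uint32_t mask = (size == 1) ? 0xFFu : (size == 2) ? 0xFFFFu : 0xFFFFFFFFu;
  uint32_t shift = offset * 8u;
  return (orig & ~(mask << shift)) | ((value & mask) << shift);
}

int main(void) {
  /* sb 0x80 at byte offset 1: only bits [15:8] change */
  assert(merge_store(0xDDCCBBAAu, 0x80u, 1, 1) == 0xDDCC80AAu);
  /* sh 0x8001 at byte offset 2: only bits [31:16] change */
  assert(merge_store(0xDDCCBBAAu, 0x8001u, 2, 2) == 0x8001BBAAu);
  return 0;
}
```

The vector half applies the same idea per element: `verify_vec_e8`/`verify_vec_e16` rebuild the expected byte image from the background pattern plus the stored elements, covering unit-stride, strided, and indexed stores.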
diff --git a/util/auto-benchmark/configs.sh b/util/auto-benchmark/configs.sh
index f3013c6..284cd26 100755
--- a/util/auto-benchmark/configs.sh
+++ b/util/auto-benchmark/configs.sh
@@ -2,7 +2,7 @@
 # CONFIGS="cachepool_fpu_512"
 # KERNELS="spin-lock fdotp-32b_M32768"
 CONFIGS="cachepool_fpu_128 cachepool_fpu_256 cachepool_fpu_512"
-KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32"
-# KERNELS="spin-lock fdotp-32b_M32768 gemv-col_M256_N128_K32 fmatmul-32b_M32_N32_K32 fmatmul-32b_M64_N64_K64"
+# KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32"
+KERNELS="spin-lock fdotp-32b_M32768 gemv-col_M256_N128_K32 fmatmul-32b_M32_N32_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable"
 
 PREFIX="test-cachepool-" # common prefix for all kernels
 ROOT_PATH=../.. # adjust if needed (path to repo root)
diff --git a/util/auto-benchmark/run_all.sh b/util/auto-benchmark/run_all.sh
index 77827b7..ff6fba6 100755
--- a/util/auto-benchmark/run_all.sh
+++ b/util/auto-benchmark/run_all.sh
@@ -1,6 +1,18 @@
 #!/usr/bin/env bash
 
+# Refuse to be sourced to avoid killing the interactive shell on errors/interrupts.
+if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then
+  echo "Error: do not source this script; run it as ./run_all.sh" >&2
+  return 1
+fi
+
 set -e
+
+cleanup() {
+  echo
+  echo "[INFO] Interrupted; stopping batch run."
+}
+trap 'cleanup; exit 130' INT TERM
 
 # Load user configs
 source ./configs.sh
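Finally, `run_all.sh` now refuses to be sourced (so a failure or Ctrl-C cannot take down the interactive shell) and traps INT/TERM to exit with status 130, the shell convention of 128 plus the signal number (SIGINT = 2). For illustration only, a C analogue of the same shutdown pattern; the batch-loop body is a placeholder:

```c
/* Sketch: mirror run_all.sh's `trap 'cleanup; exit 130' INT TERM` in C. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t got_sig = 0;

static void on_signal(int sig) { got_sig = sig; }

int main(void) {
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_handler = on_signal;
  sigaction(SIGINT, &sa, NULL);
  sigaction(SIGTERM, &sa, NULL);

  while (!got_sig) {
    /* ... run one config/kernel combination per iteration ... */
    sleep(1);
  }

  fprintf(stderr, "\n[INFO] Interrupted; stopping batch run.\n");
  return 128 + got_sig; /* 130 for SIGINT, matching the bash trap */
}
```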