From 1aef3467379fa979c29e145b0a6329a2a35955d6 Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Wed, 7 Jan 2026 18:51:14 +0100 Subject: [PATCH 1/7] [SW] Add software for insitu cache byte/half-word/word access test. --- software/tests/CMakeLists.txt | 1 + software/tests/byte-enable/main.c | 222 ++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 software/tests/byte-enable/main.c diff --git a/software/tests/CMakeLists.txt b/software/tests/CMakeLists.txt index 786c4c5..54013c8 100644 --- a/software/tests/CMakeLists.txt +++ b/software/tests/CMakeLists.txt @@ -85,6 +85,7 @@ set(SNITCH_TEST_PREFIX cachepool-) ## RLC add_spatz_test_zeroParam(spin-lock spin-lock/main.c) add_spatz_test_zeroParam(mcs-lock mcs-lock/main.c) +add_spatz_test_zeroParam(byte-enable byte-enable/main.c) # add_snitch_test(multi_producer_single_consumer_double_linked_list multi_producer_single_consumer_double_linked_list/main.c) # add_spatz_test_threeParam(multi_producer_single_consumer_double_linked_list multi_producer_single_consumer_double_linked_list/main.c 1 1350 1000) diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c new file mode 100644 index 0000000..86407e7 --- /dev/null +++ b/software/tests/byte-enable/main.c @@ -0,0 +1,222 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Zexin Fu + +#include +#include +#include +#include +#include +#include "printf.h" +#ifdef DATAHEADER +#include DATAHEADER +#endif + +#define L1LineWidth (512 / 8) // 512 bits +#define BUF_LINES 2 +#define BUF_BYTES (L1LineWidth * BUF_LINES) + +static uint8_t test_buf[BUF_BYTES] __attribute__((aligned(L1LineWidth))) + __attribute__((section(".data"))); + +static inline void store_b(void *addr, uint8_t value) { + asm volatile("sb %0, 0(%1)" :: "r"(value), "r"(addr) : "memory"); +} + +static inline void store_h(void *addr, uint16_t value) { + asm volatile("sh %0, 0(%1)" :: "r"(value), "r"(addr) : "memory"); +} + +static inline void store_w(void *addr, uint32_t value) { + asm volatile("sw %0, 0(%1)" :: "r"(value), "r"(addr) : "memory"); +} + + +static inline int32_t load_b(const void *addr) { + int32_t out; + asm volatile("lb %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory"); + return out; +} + +static inline int32_t load_h(const void *addr) { + int32_t out; + asm volatile("lh %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory"); + return out; +} + +static inline int32_t load_w(const void *addr) { + int32_t out; + asm volatile("lw %0, 0(%1)" : "=r"(out) : "r"(addr) : "memory"); + return out; +} + +static void init_pattern(uint8_t *buf, size_t bytes) { + size_t words = bytes / 4U; + for (size_t w = 0; w < words; w++) { + size_t base = w * 4U; + uint32_t b0 = (uint8_t)(0xA5U ^ (uint8_t)(base + 0U)); + uint32_t b1 = (uint8_t)(0xA5U ^ (uint8_t)(base + 1U)); + uint32_t b2 = (uint8_t)(0xA5U ^ (uint8_t)(base + 2U)); + uint32_t b3 = (uint8_t)(0xA5U ^ (uint8_t)(base + 3U)); + uint32_t word = (b0) | (b1 << 8U) | (b2 << 16U) | (b3 << 24U); + store_w(buf + base, word); + } + + for (size_t i = words * 4U; i < bytes; i++) { + buf[i] = (uint8_t)(0xA5U ^ (uint8_t)i); + } +} + +static int check_store_and_load(const char *name, uint8_t *base, + uint32_t offset, uint32_t size, + uint32_t value) { + int errors = 0; + + if (((uintptr_t)base & 0x3U) != 0U) { + printf("[FAIL] %s: base 
misaligned 0x%llx\n", name, + (unsigned long long)(uintptr_t)base); + return 1; + } + + if ((offset + size) > 4U) { + printf("[FAIL] %s: offset+size out of range\n", name); + return 1; + } + + uint32_t orig = (uint32_t)load_w(base); + + switch (size) { + case 1: + store_b(base + offset, (uint8_t)value); + break; + case 2: + store_h(base + offset, (uint16_t)value); + break; + case 4: + store_w(base + offset, (uint32_t)value); + break; + default: + printf("[FAIL] %s: invalid size %u\n", name, size); + return 1; + } + + uint32_t after = (uint32_t)load_w(base); + uint32_t expected = orig; + + uint32_t mask = (size == 1) ? 0xFFU + : (size == 2) ? 0xFFFFU + : 0xFFFFFFFFU; + uint32_t shift = offset * 8U; + expected = (orig & ~(mask << shift)) | ((value & mask) << shift); + + int store_ok = (after == expected); + if (!store_ok) { + printf("[FAIL] %s: store before 0x%08x expected 0x%08x got 0x%08x\n", name, + (unsigned int)orig, (unsigned int)expected, (unsigned int)after); + errors++; + } + + int32_t load_got = 0; + int32_t load_exp = 0; + const char *load_name = "l?"; + int load_ok = 0; + + if (size == 1) { + load_name = "lb"; + load_got = load_b(base + offset); + load_exp = (int8_t)value; + } else if (size == 2) { + load_name = "lh"; + load_got = load_h(base + offset); + load_exp = (int16_t)value; + } else if (size == 4) { + load_name = "lw"; + load_got = load_w(base + offset); + load_exp = (int32_t)value; + } + + load_ok = (load_got == load_exp); + if (!load_ok) { + printf("[FAIL] %s: %s before 0x%08x expected 0x%08x got 0x%08x\n", name, + load_name, (unsigned int)orig, (unsigned int)load_exp, + (unsigned int)load_got); + errors++; + } + + if (store_ok) { + printf("[PASS] %s: store before 0x%08x expected 0x%08x got 0x%08x\n", name, + (unsigned int)orig, (unsigned int)expected, (unsigned int)after); + } + if (load_ok) { + printf("[PASS] %s: %s before 0x%08x expected 0x%08x got 0x%08x\n", name, + load_name, (unsigned int)orig, (unsigned int)load_exp, + (unsigned 
int)load_got); + } + return errors; +} + +int main(void) { + const unsigned int core_id = snrt_cluster_core_idx(); + + if (core_id == 0) { + l1d_init(0); + uint32_t offset = 31U - __builtin_clz((unsigned int)L1LineWidth); + l1d_xbar_config(offset); + } + + snrt_cluster_hw_barrier(); + + int errors = 0; + + if (core_id == 0) { + init_pattern(test_buf, BUF_BYTES); + printf("original data (line order, high->low addr):\n"); + for (unsigned int line = 0; line < BUF_LINES; line++) { + printf("line %u: ", line); + for (unsigned int byte = 0; byte < L1LineWidth; byte++) { + unsigned int idx = line * L1LineWidth + (L1LineWidth - 1U - byte); + printf("%02x ", (unsigned int)test_buf[idx]); + } + printf("\n"); + } + + errors += check_store_and_load("sb/lb line0+1", test_buf, 1, 1, 0x80U); + + errors += check_store_and_load("sh/lh line0+4+2", test_buf + 4, 2, 2, + 0x8001U); + + errors += check_store_and_load("sw/lw line0+16+0", test_buf + 16, 0, 4, + 0x80000005U); + + errors += check_store_and_load("sb/lb line1+3", test_buf + L1LineWidth, 3, + 1, 0x7FU); + + if (errors == 0) { + printf("Byte-enable test PASSED\n"); + } else { + printf("Byte-enable test FAILED: %d errors\n", errors); + } + } + + snrt_cluster_hw_barrier(); + + if (core_id == 0) { + set_eoc(); + } + + return 0; +} From 8b78a290bbf10e5b78f20627c6fa006240c12d93 Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Thu, 8 Jan 2026 19:29:10 +0100 Subject: [PATCH 2/7] [SW] Add more info output for cache byte access test. 
--- software/tests/byte-enable/main.c | 42 ++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c index 86407e7..f4701a3 100644 --- a/software/tests/byte-enable/main.c +++ b/software/tests/byte-enable/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "printf.h" #ifdef DATAHEADER #include DATAHEADER @@ -81,6 +82,17 @@ static void init_pattern(uint8_t *buf, size_t bytes) { } } +static unsigned long long cycle_to_ns(size_t cycle) { + return (unsigned long long)cycle * 2ULL + 10ULL; +} + +static void trace_inst(const char *name, const char *inst, const void *addr, + size_t cycle) { + unsigned long long ns = cycle_to_ns(cycle); + printf("[TRACE] %s: %s @ 0x%08x cycle %u ns %llu\n", name, inst, + (unsigned int)(uintptr_t)addr, (unsigned int)cycle, ns); +} + static int check_store_and_load(const char *name, uint8_t *base, uint32_t offset, uint32_t size, uint32_t value) { @@ -99,21 +111,33 @@ static int check_store_and_load(const char *name, uint8_t *base, uint32_t orig = (uint32_t)load_w(base); + const uint8_t *addr = base + offset; + const char *store_name = "s?"; + size_t store_cycle = 0; + switch (size) { case 1: - store_b(base + offset, (uint8_t)value); + store_name = "sb"; + store_cycle = benchmark_get_cycle(); + store_b((void *)addr, (uint8_t)value); break; case 2: - store_h(base + offset, (uint16_t)value); + store_name = "sh"; + store_cycle = benchmark_get_cycle(); + store_h((void *)addr, (uint16_t)value); break; case 4: - store_w(base + offset, (uint32_t)value); + store_name = "sw"; + store_cycle = benchmark_get_cycle(); + store_w((void *)addr, (uint32_t)value); break; default: printf("[FAIL] %s: invalid size %u\n", name, size); return 1; } + trace_inst(name, store_name, addr, store_cycle); + uint32_t after = (uint32_t)load_w(base); uint32_t expected = orig; @@ -133,22 +157,28 @@ static int check_store_and_load(const char *name, uint8_t 
*base, int32_t load_got = 0; int32_t load_exp = 0; const char *load_name = "l?"; + size_t load_cycle = 0; int load_ok = 0; if (size == 1) { load_name = "lb"; - load_got = load_b(base + offset); + load_cycle = benchmark_get_cycle(); + load_got = load_b(addr); load_exp = (int8_t)value; } else if (size == 2) { load_name = "lh"; - load_got = load_h(base + offset); + load_cycle = benchmark_get_cycle(); + load_got = load_h(addr); load_exp = (int16_t)value; } else if (size == 4) { load_name = "lw"; - load_got = load_w(base + offset); + load_cycle = benchmark_get_cycle(); + load_got = load_w(addr); load_exp = (int32_t)value; } + trace_inst(name, load_name, addr, load_cycle); + load_ok = (load_got == load_exp); if (!load_ok) { printf("[FAIL] %s: %s before 0x%08x expected 0x%08x got 0x%08x\n", name, From 8f1d09bba2514703ef54b01c3c400159fd0fb0a7 Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Fri, 9 Jan 2026 16:56:40 +0100 Subject: [PATCH 3/7] [RTL] wire byte strobes into L1 data banks - pass core wstrb into cachepool_cache_ctrl and use per-byte bank enables - map wide line SRAMs to byte-wide BE slices in cachepool_tile - bump 512b line tag/meta width to avoid truncation with byte masks - update local build/sim overrides used to run the modified insitu-cache --- config/cachepool_512.mk | 2 +- config/cachepool_fpu_512.mk | 2 +- hardware/src/cachepool_tile.sv | 47 ++++++++++++++++++++++++++-------- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/config/cachepool_512.mk b/config/cachepool_512.mk index 8970304..05992d5 100644 --- a/config/cachepool_512.mk +++ b/config/cachepool_512.mk @@ -52,7 +52,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 52 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_512.mk index c922ef3..c51eccd 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_512.mk @@ -54,7 +54,7 
@@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 52 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 4a943a0..9dcf9d6 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -415,6 +415,7 @@ module cachepool_tile tcdm_user_t [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_meta; logic [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_write; data_t [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_data; + strb_t [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_req_strb; logic [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_valid; logic [NumL1CtrlTile-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_ready; @@ -433,7 +434,7 @@ module cachepool_tile logic [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_we; tcdm_bank_addr_t [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_addr; data_t [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_wdata; - logic [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_be; + logic [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0][DataWidth/8-1:0] l1_data_bank_be; data_t [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_rdata; logic [NumL1CtrlTile-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_gnt; @@ -630,6 +631,7 @@ module cachepool_tile assign cache_req_meta [cb][j] = cache_req_reg.q.user; assign cache_req_write[cb][j] = cache_req_reg.q.write; assign cache_req_data [cb][j] = cache_req_reg.q.data; + assign cache_req_strb [cb][j] = cache_req_reg.q.strb; assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j]; assign cache_rsp_reg.q_ready = cache_req_ready[cb][j]; @@ -646,6 +648,7 @@ module cachepool_tile assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user; assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write; assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data; + assign 
cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb; assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j]; @@ -661,6 +664,21 @@ module cachepool_tile // For address scrambling localparam NumSelBits = $clog2(NumL1CtrlTile); localparam NumWordPerLine = L1LineWidth / DataWidth; + localparam int unsigned WordBytes = DataWidth / 8; + initial begin + $display("Cache Configuration:"); + $display(" NumCtrl : %0d", NumL1CtrlTile); + $display(" LineWidth : %0d", L1LineWidth); + $display(" NumWordPerLine : %0d", NumWordPerLine); + $display(" NumSet : %0d", L1NumSet); + $display(" AssoPerCtrl : %0d", L1AssoPerCtrl); + $display(" BankFactor : %0d", L1BankFactor); + $display(" NumTagBankPerCtrl : %0d", NumTagBankPerCtrl); + $display(" NumDataBankPerCtrl: %0d", NumDataBankPerCtrl); + $display(" CoalFactor : %0d", L1CoalFactor); + $display(" RefillDataWidth: %0d", RefillDataWidth); + $display(" DynamicOffset : %0d", dynamic_offset); + end logic [SpatzAxiAddrWidth-1:0] bitmask_up, bitmask_lo; assign bitmask_lo = (1 << dynamic_offset) - 1; // We will keep AddrWidth - Offset - log2(CacheBanks) bits in the upper half, and add back the NumSelBits bits @@ -679,6 +697,7 @@ module cachepool_tile .CoalExtFactor (L1CoalFactor ), .AddrWidth (L1AddrWidth ), .WordWidth (DataWidth ), + .ByteWidth (8 ), .TagWidth (L1TagDataWidth ), // Cache .NumCacheEntry (L1NumEntryPerCtrl ), @@ -712,6 +731,7 @@ module cachepool_tile .core_req_meta_i (cache_req_meta [cb] ), .core_req_write_i (cache_req_write[cb] ), .core_req_wdata_i (cache_req_data [cb] ), + .core_req_wstrb_i (cache_req_strb [cb] ), // Response .core_resp_valid_o (cache_rsp_valid[cb] ), .core_resp_ready_i (cache_rsp_ready[cb] ), @@ -810,11 +830,18 @@ module cachepool_tile end // TODO: Should we use a single large bank or multiple narrow ones? 
- for (genvar j = 0; j < NumDataBankPerCtrl; j = j+NumWordPerLine) begin : gen_l1_data_banks + for (genvar bank = 0; bank < NumDataBankPerCtrl/NumWordPerLine; bank++) begin : gen_l1_data_banks + localparam int unsigned BaseIdx = bank * NumWordPerLine; + logic [NumWordPerLine*WordBytes-1:0] bank_be; + + for (genvar w = 0; w < NumWordPerLine; w++) begin : gen_bank_be + assign bank_be[w*WordBytes +: WordBytes] = l1_data_bank_be[cb][BaseIdx + w]; + end + tc_sram_impl #( .NumWords (L1CacheWayEntry/L1BankFactor), .DataWidth (L1LineWidth), - .ByteWidth (DataWidth ), + .ByteWidth (8 ), .NumPorts (1 ), .Latency (1 ), .SimInit ("zeros" ) @@ -823,15 +850,15 @@ module cachepool_tile .rst_ni (rst_ni ), .impl_i ('0 ), .impl_o (/* unsed */ ), - .req_i ( l1_data_bank_req [cb][j] ), - .we_i ( l1_data_bank_we [cb][j] ), - .addr_i ( l1_data_bank_addr [cb][j] ), - .wdata_i( l1_data_bank_wdata[cb][j+:NumWordPerLine]), - .be_i ( l1_data_bank_be [cb][j+:NumWordPerLine]), - .rdata_o( l1_data_bank_rdata[cb][j+:NumWordPerLine]) + .req_i ( l1_data_bank_req [cb][BaseIdx] ), + .we_i ( l1_data_bank_we [cb][BaseIdx] ), + .addr_i ( l1_data_bank_addr [cb][BaseIdx] ), + .wdata_i( l1_data_bank_wdata[cb][BaseIdx+:NumWordPerLine]), + .be_i ( bank_be ), + .rdata_o( l1_data_bank_rdata[cb][BaseIdx+:NumWordPerLine]) ); - assign l1_data_bank_gnt[cb][j+:NumWordPerLine] = {NumWordPerLine{1'b1}}; + assign l1_data_bank_gnt[cb][BaseIdx+:NumWordPerLine] = {NumWordPerLine{1'b1}}; end // for (genvar j = 0; j < NumDataBankPerCtrl; j++) begin : gen_l1_data_banks From 9f192fe28a0f94e1b80a69bc4dbd19b9e28820af Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Fri, 9 Jan 2026 22:10:45 +0100 Subject: [PATCH 4/7] Update the insitu-cache dep which supports byte access. 
--- Bender.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bender.lock b/Bender.lock index 958e1ed..138e86b 100644 --- a/Bender.lock +++ b/Bender.lock @@ -71,7 +71,7 @@ packages: - common_verification - register_interface insitu-cache: - revision: 04f72a7ac7e9091f1820f0dac59bb778b134d7f7 + revision: 57c0884166dd0f1b7c484633b437fe11d5d62c89 version: null source: Git: https://github.com/pulp-platform/Insitu-Cache.git From 49cdcd5589b649c8f02d1851babc9886f32add7a Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Fri, 9 Jan 2026 22:53:47 +0100 Subject: [PATCH 5/7] [SW] Add vector byte/half-word test. --- software/tests/byte-enable/main.c | 213 +++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 5 deletions(-) diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c index f4701a3..e0e37b6 100644 --- a/software/tests/byte-enable/main.c +++ b/software/tests/byte-enable/main.c @@ -31,9 +31,32 @@ #define BUF_LINES 2 #define BUF_BYTES (L1LineWidth * BUF_LINES) +#ifndef ENABLE_SCALAR_TESTS +#define ENABLE_SCALAR_TESTS 1 +#endif + +#ifndef ENABLE_VECTOR_TESTS +#define ENABLE_VECTOR_TESTS 1 +#endif + +#define VEC_E8_LEN 16U +#define VEC_E16_LEN 8U +#define VEC_BUF_BYTES 256U + static uint8_t test_buf[BUF_BYTES] __attribute__((aligned(L1LineWidth))) __attribute__((section(".data"))); +static uint8_t vec_buf[VEC_BUF_BYTES] __attribute__((aligned(64))) + __attribute__((section(".data"))); +static uint8_t vec_data8[VEC_E8_LEN] __attribute__((aligned(4))) + __attribute__((section(".data"))); +static uint16_t vec_data16[VEC_E16_LEN] __attribute__((aligned(4))) + __attribute__((section(".data"))); +static uint8_t vec_idx8[VEC_E8_LEN] __attribute__((aligned(4))) + __attribute__((section(".data"))); +static uint16_t vec_idx16[VEC_E16_LEN] __attribute__((aligned(4))) + __attribute__((section(".data"))); + static inline void store_b(void *addr, uint8_t value) { asm volatile("sb %0, 0(%1)" :: "r"(value), "r"(addr) : "memory"); } @@ 
-65,20 +88,35 @@ static inline int32_t load_w(const void *addr) { return out; } +static uint8_t pattern_byte(size_t idx) { + return (uint8_t)(0xA5U ^ (uint8_t)idx); +} + static void init_pattern(uint8_t *buf, size_t bytes) { size_t words = bytes / 4U; for (size_t w = 0; w < words; w++) { size_t base = w * 4U; - uint32_t b0 = (uint8_t)(0xA5U ^ (uint8_t)(base + 0U)); - uint32_t b1 = (uint8_t)(0xA5U ^ (uint8_t)(base + 1U)); - uint32_t b2 = (uint8_t)(0xA5U ^ (uint8_t)(base + 2U)); - uint32_t b3 = (uint8_t)(0xA5U ^ (uint8_t)(base + 3U)); + uint32_t b0 = pattern_byte(base + 0U); + uint32_t b1 = pattern_byte(base + 1U); + uint32_t b2 = pattern_byte(base + 2U); + uint32_t b3 = pattern_byte(base + 3U); uint32_t word = (b0) | (b1 << 8U) | (b2 << 16U) | (b3 << 24U); store_w(buf + base, word); } for (size_t i = words * 4U; i < bytes; i++) { - buf[i] = (uint8_t)(0xA5U ^ (uint8_t)i); + buf[i] = pattern_byte(i); + } +} + +static void init_vec_data(void) { + for (unsigned int i = 0; i < VEC_E8_LEN; i++) { + vec_data8[i] = (uint8_t)(0x10U + (i * 3U)); + vec_idx8[i] = (uint8_t)(i * 3U); + } + for (unsigned int i = 0; i < VEC_E16_LEN; i++) { + vec_data16[i] = (uint16_t)(0x2000U + (i * 5U)); + vec_idx16[i] = (uint16_t)(i * 4U); } } @@ -199,6 +237,165 @@ static int check_store_and_load(const char *name, uint8_t *base, return errors; } +static int verify_vec_e8(const char *name, uint8_t *buf, size_t base_index, + const uint8_t *vals, const uint16_t *offsets, + size_t count, size_t region_bytes) { + int errors = 0; + + for (size_t pos = 0; pos < region_bytes; pos++) { + uint8_t exp = pattern_byte(base_index + pos); + for (size_t i = 0; i < count; i++) { + if (offsets[i] == pos) { + exp = vals[i]; + break; + } + } + uint8_t got = buf[base_index + pos]; + if (got != exp) { + printf("[FAIL] %s: byte+%u expected 0x%02x got 0x%02x\n", name, + (unsigned int)pos, (unsigned int)exp, (unsigned int)got); + errors++; + } + } + + if (errors == 0) { + printf("[PASS] %s: vector store verify\n", name); 
+ } + + return errors; +} + +static int verify_vec_e16(const char *name, uint8_t *buf, size_t base_index, + const uint16_t *vals, const uint16_t *offsets, + size_t count, size_t region_bytes) { + int errors = 0; + + for (size_t pos = 0; pos < region_bytes; pos++) { + uint8_t exp = pattern_byte(base_index + pos); + for (size_t i = 0; i < count; i++) { + if (offsets[i] == pos) { + exp = (uint8_t)(vals[i] & 0xFFU); + break; + } else if (offsets[i] + 1U == pos) { + exp = (uint8_t)((vals[i] >> 8U) & 0xFFU); + break; + } + } + uint8_t got = buf[base_index + pos]; + if (got != exp) { + printf("[FAIL] %s: byte+%u expected 0x%02x got 0x%02x\n", name, + (unsigned int)pos, (unsigned int)exp, (unsigned int)got); + errors++; + } + } + + if (errors == 0) { + printf("[PASS] %s: vector store verify\n", name); + } + + return errors; +} + +static int run_vector_tests(void) { + int errors = 0; + uint32_t avl; + uint32_t vlen; + uint16_t offsets16[VEC_E16_LEN]; + uint16_t offsets8[VEC_E8_LEN]; + + init_pattern(vec_buf, VEC_BUF_BYTES); + init_vec_data(); + + const uint32_t stride_b = 2U; + const uint32_t stride_h = 4U; + + const size_t base_e8_unit = 0U; + const size_t base_e8_stride = 64U; + const size_t base_e8_index = 128U; + const size_t base_e16_unit = 160U; + const size_t base_e16_stride = 192U; + const size_t base_e16_index = 224U; + + avl = VEC_E8_LEN; + asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle8.v v0, (%0)" :: "r"(vec_data8) : "memory"); + size_t cyc = benchmark_get_cycle(); + asm volatile("vse8.v v0, (%0)" :: "r"(vec_buf + base_e8_unit) : "memory"); + trace_inst("vec e8 unit", "vse8.v", vec_buf + base_e8_unit, cyc); + for (unsigned int i = 0; i < VEC_E8_LEN; i++) + offsets8[i] = i; + errors += verify_vec_e8("vec e8 unit", vec_buf, base_e8_unit, vec_data8, + offsets8, VEC_E8_LEN, VEC_E8_LEN); + + avl = VEC_E8_LEN; + asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle8.v v0, (%0)" :: 
"r"(vec_data8) : "memory"); + cyc = benchmark_get_cycle(); + asm volatile("vsse8.v v0, (%0), %1" :: "r"(vec_buf + base_e8_stride), + "r"(stride_b) : "memory"); + trace_inst("vec e8 strided", "vsse8.v", vec_buf + base_e8_stride, cyc); + for (unsigned int i = 0; i < VEC_E8_LEN; i++) + offsets8[i] = i * stride_b; + errors += verify_vec_e8("vec e8 strided", vec_buf, base_e8_stride, vec_data8, + offsets8, VEC_E8_LEN, + (VEC_E8_LEN - 1U) * stride_b + 1U); + + avl = VEC_E8_LEN; + asm volatile("vsetvli %0, %1, e8, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle8.v v0, (%0)" :: "r"(vec_data8) : "memory"); + asm volatile("vle8.v v1, (%0)" :: "r"(vec_idx8) : "memory"); + cyc = benchmark_get_cycle(); + asm volatile("vsuxei8.v v0, (%0), v1" :: "r"(vec_buf + base_e8_index) + : "memory"); + trace_inst("vec e8 indexed", "vsuxei8.v", vec_buf + base_e8_index, cyc); + for (unsigned int i = 0; i < VEC_E8_LEN; i++) + offsets8[i] = vec_idx8[i]; + errors += verify_vec_e8("vec e8 indexed", vec_buf, base_e8_index, vec_data8, + offsets8, VEC_E8_LEN, + offsets8[VEC_E8_LEN - 1U] + 1U); + + avl = VEC_E16_LEN; + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory"); + cyc = benchmark_get_cycle(); + asm volatile("vse16.v v0, (%0)" :: "r"(vec_buf + base_e16_unit) : "memory"); + trace_inst("vec e16 unit", "vse16.v", vec_buf + base_e16_unit, cyc); + for (unsigned int i = 0; i < VEC_E16_LEN; i++) + offsets16[i] = i * 2U; + errors += verify_vec_e16("vec e16 unit", vec_buf, base_e16_unit, vec_data16, + offsets16, VEC_E16_LEN, VEC_E16_LEN * 2U); + + avl = VEC_E16_LEN; + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory"); + cyc = benchmark_get_cycle(); + asm volatile("vsse16.v v0, (%0), %1" :: "r"(vec_buf + base_e16_stride), + "r"(stride_h) : "memory"); + trace_inst("vec e16 strided", "vsse16.v", vec_buf + base_e16_stride, 
cyc); + for (unsigned int i = 0; i < VEC_E16_LEN; i++) + offsets16[i] = i * stride_h; + errors += verify_vec_e16("vec e16 strided", vec_buf, base_e16_stride, + vec_data16, offsets16, VEC_E16_LEN, + (VEC_E16_LEN - 1U) * stride_h + 2U); + + avl = VEC_E16_LEN; + asm volatile("vsetvli %0, %1, e16, m1, ta, ma" : "=r"(vlen) : "r"(avl)); + asm volatile("vle16.v v0, (%0)" :: "r"(vec_data16) : "memory"); + asm volatile("vle16.v v1, (%0)" :: "r"(vec_idx16) : "memory"); + cyc = benchmark_get_cycle(); + asm volatile("vsuxei16.v v0, (%0), v1" :: "r"(vec_buf + base_e16_index) + : "memory"); + trace_inst("vec e16 indexed", "vsuxei16.v", vec_buf + base_e16_index, cyc); + for (unsigned int i = 0; i < VEC_E16_LEN; i++) + offsets16[i] = vec_idx16[i]; + errors += verify_vec_e16("vec e16 indexed", vec_buf, base_e16_index, + vec_data16, offsets16, VEC_E16_LEN, + offsets16[VEC_E16_LEN - 1U] + 2U); + + (void)vlen; + return errors; +} + int main(void) { const unsigned int core_id = snrt_cluster_core_idx(); @@ -224,6 +421,7 @@ int main(void) { printf("\n"); } +#if ENABLE_SCALAR_TESTS errors += check_store_and_load("sb/lb line0+1", test_buf, 1, 1, 0x80U); errors += check_store_and_load("sh/lh line0+4+2", test_buf + 4, 2, 2, @@ -234,6 +432,11 @@ int main(void) { errors += check_store_and_load("sb/lb line1+3", test_buf + L1LineWidth, 3, 1, 0x7FU); +#endif + +#if ENABLE_VECTOR_TESTS + errors += run_vector_tests(); +#endif if (errors == 0) { printf("Byte-enable test PASSED\n"); From 75a9c9fe138a9ee469acc1ba0d1ce0001105a1dd Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Tue, 13 Jan 2026 11:53:43 +0100 Subject: [PATCH 6/7] [SCRIPT] Update the auto-benchmark scripts: 1. Add new benchmarks; 2. Fix the incorrect exit issue.
--- util/auto-benchmark/configs.sh | 4 ++-- util/auto-benchmark/run_all.sh | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/util/auto-benchmark/configs.sh b/util/auto-benchmark/configs.sh index f3013c6..284cd26 100755 --- a/util/auto-benchmark/configs.sh +++ b/util/auto-benchmark/configs.sh @@ -2,7 +2,7 @@ # CONFIGS="cachepool_fpu_512" # KERNELS="spin-lock fdotp-32b_M32768" CONFIGS="cachepool_fpu_128 cachepool_fpu_256 cachepool_fpu_512" -KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32" -# KERNELS="spin-lock fdotp-32b_M32768 gemv-col_M256_N128_K32 fmatmul-32b_M32_N32_K32 fmatmul-32b_M64_N64_K64" +# KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32" +KERNELS="spin-lock fdotp-32b_M32768 gemv-col_M256_N128_K32 fmatmul-32b_M32_N32_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. # adjust if needed (path to repo root) diff --git a/util/auto-benchmark/run_all.sh b/util/auto-benchmark/run_all.sh index 77827b7..ff6fba6 100755 --- a/util/auto-benchmark/run_all.sh +++ b/util/auto-benchmark/run_all.sh @@ -1,6 +1,18 @@ #!/usr/bin/env bash +# Refuse to be sourced to avoid killing the interactive shell on errors/interrupts. +if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then + echo "Error: do not source this script; run it as ./run_all.sh" >&2 + return 1 +fi + set -e +cleanup() { + echo + echo "[INFO] Interrupted; stopping batch run." +} +trap 'cleanup; exit 130' INT TERM + # Load user configs source ./configs.sh From 66c4d98067da2936586a33c035ec74f8c410cca9 Mon Sep 17 00:00:00 2001 From: Zexin Fu Date: Tue, 13 Jan 2026 12:36:27 +0100 Subject: [PATCH 7/7] [Lint] Fix a line that exceeds the maximum line length.
--- hardware/src/cachepool_tile.sv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 9dcf9d6..cbc5141 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -830,7 +830,8 @@ module cachepool_tile end // TODO: Should we use a single large bank or multiple narrow ones? - for (genvar bank = 0; bank < NumDataBankPerCtrl/NumWordPerLine; bank++) begin : gen_l1_data_banks + for (genvar bank = 0; bank < (NumDataBankPerCtrl/NumWordPerLine); + bank++) begin : gen_l1_data_banks localparam int unsigned BaseIdx = bank * NumWordPerLine; logic [NumWordPerLine*WordBytes-1:0] bank_be;