diff --git a/hardware/CMakeLists.txt b/hardware/CMakeLists.txt index 893f4f8..118e4bb 100644 --- a/hardware/CMakeLists.txt +++ b/hardware/CMakeLists.txt @@ -9,6 +9,8 @@ find_package(CoyoteHW REQUIRED) set(N_REGIONS 1) set(EN_STRM 1) set(N_STRM_AXI 4) +set(EN_MEM 1) +set(N_CARD_AXI 1) set(FDEV_NAME u55c) validation_checks_hw() diff --git a/hardware/src/hdl/buffer/stream_buffer_interfaces.sv b/hardware/src/hdl/buffer/stream_buffer_interfaces.sv new file mode 100644 index 0000000..f535f1a --- /dev/null +++ b/hardware/src/hdl/buffer/stream_buffer_interfaces.sv @@ -0,0 +1,89 @@ +`timescale 1ns / 1ps + +import libstf::*; + +`include "libstf_macros.svh" + +/* + * This interface links the StreamBufferWriter and StreamBufferReader, acting as + * a stream where tokens are shared. Each token represents the amount of + * bytes written by the latest card write. This is used to pause reads until + * data for a region has been fully written, so that partial data is never + * read. + */ +interface stream_buffer_link_i ( + input logic clk, + input logic rst_n +); + vaddress_t vaddr; + vaddress_t size; + logic last; + logic valid; + logic ready; + + task tie_off_m(); + valid = 1'b0; + endtask + + task tie_off_s(); + ready = 1'b1; + endtask + + modport m ( + import tie_off_m, + output vaddr, size, last, valid, + input ready + ); + + modport s ( + import tie_off_s, + output ready, + input vaddr, size, last, valid + ); + +`ifndef SYNTHESIS + `STF_ASSERT_SIGNAL_STABLE(vaddr); + `STF_ASSERT_SIGNAL_STABLE(size); + `STF_ASSERT_SIGNAL_STABLE(last); + + `STF_ASSERT_NOT_UNDEFINED(valid); + `STF_ASSERT_NOT_UNDEFINED(ready); +`endif +endinterface + +interface mem_read_config_i ( + input logic clk, + input logic rst_n +); + vaddress_t vaddr; + data32_t size; + logic valid; + logic ready; + + task tie_off_m(); + valid = 1'b0; + endtask + + task tie_off_s(); + ready = 1'b1; + endtask + + modport m ( + import tie_off_m, + output vaddr, size, valid, + input ready + ); + + modport s ( + import
tie_off_s, + output ready, + input vaddr, size, valid + ); + +`ifndef SYNTHESIS + `STF_ASSERT_STABLE(vaddr, valid, ready); + `STF_ASSERT_STABLE(size, valid, ready); + `STF_ASSERT_NOT_UNDEFINED(valid); + `STF_ASSERT_NOT_UNDEFINED(ready); +`endif +endinterface diff --git a/hardware/src/hdl/buffer/stream_buffer_reader.sv b/hardware/src/hdl/buffer/stream_buffer_reader.sv new file mode 100644 index 0000000..5df4f89 --- /dev/null +++ b/hardware/src/hdl/buffer/stream_buffer_reader.sv @@ -0,0 +1,86 @@ +`timescale 1ns / 1ps + +import lynxTypes::*; +import libstf::*; + +`include "axi_macros.svh" +`include "lynx_macros.svh" +`include "libstf_macros.svh" + +/* + * NOTE: the `in` port should be wired to the AXI stream where incoming data + * will be streamed after a request has been sent via sq_rd. + * For example, in the case of card memory, it should be + * axis_card_recv[AXI_STRM_ID]. + * NOTE: TRANSFER_SIZE must be the same as configured in the + * writer. + */ +module StreamBufferReader #( + parameter AXI_STRM_ID = 0, + parameter TRANSFER_SIZE = TRANSFER_SIZE_BYTES +) ( + input logic clk, + input logic rst_n, + + metaIntf.m sq_rd, + metaIntf.s cq_rd, + + stream_buffer_link_i.s link, + + AXI4SR.s in, + AXI4S.m out +); + +`RESET_RESYNC // Reset pipelining + +mem_read_config_i mem_config (.clk(clk), .rst_n(reset_synced)); + +assign mem_config.vaddr = link.vaddr; +assign mem_config.size = link.size; +assign mem_config.valid = link.valid; +assign link.ready = mem_config.ready; + +AXI4S inner_out (.aclk(clk), .aresetn(reset_synced)); + +StreamReader #( + .STRM(STRM_CARD), + .AXI_STRM_ID(AXI_STRM_ID), + .TRANSFER_LENGTH_BYTES(TRANSFER_SIZE) +) inst_stream_reader ( + .clk(clk), + .rst_n(reset_synced), + + .sq_rd(sq_rd), + .cq_rd(cq_rd), + + .mem_config(mem_config), + + .input_data(in), + .output_data(inner_out) +); + +logic last_received, n_last_received; + +always_ff @(posedge clk) begin + if (reset_synced == 1'b0) begin + last_received <= 1'b0; + end else begin +
last_received <= n_last_received; + end +end + +always_comb begin + n_last_received = last_received; + + if (link.ready && link.valid) begin + n_last_received = link.last; + end +end + +assign out.tdata = inner_out.tdata; +assign out.tkeep = inner_out.tkeep; +assign out.tlast = inner_out.tlast && last_received; +assign out.tvalid = inner_out.tvalid; +assign inner_out.tready = out.tready; + +endmodule diff --git a/hardware/src/hdl/buffer/stream_buffer_writer.sv b/hardware/src/hdl/buffer/stream_buffer_writer.sv new file mode 100644 index 0000000..1588369 --- /dev/null +++ b/hardware/src/hdl/buffer/stream_buffer_writer.sv @@ -0,0 +1,141 @@ +`timescale 1ns / 1ps + +import libstf::*; +import lynxTypes::*; + +`include "axi_macros.svh" +`include "libstf_macros.svh" + +/* + * NOTE: out must be wired to axis_card_send[AXI_STRM_ID]. + */ +module StreamBufferWriter #( + parameter AXI_STRM_ID = 0, + parameter TRANSFER_SIZE = TRANSFER_SIZE_BYTES, + // NOTE: this is the number of transfers that will be allocated at a time + // when more memory is provided to the underlying StreamWriter. + parameter TRANFERS_PER_ALLOCATION = MAXIMUM_HOST_ALLOCATION_SIZE_BYTES / TRANSFER_SIZE +) ( + input logic clk, + input logic rst_n, + + metaIntf.m sq_wr, + metaIntf.s cq_wr, + + AXI4S.s in, + + stream_buffer_link_i.m link, + AXI4SR.m out +); + +`RESET_RESYNC // Reset pipelining + +localparam int ALLOCATION_BYTES = TRANFERS_PER_ALLOCATION * TRANSFER_SIZE; + +// This stream is used on the host to allocate more data for the StreamWriter. +// On card memory, there's no need for allocations, new mem_config_i regions +// are provided when requested from the code below. Its notifications are not +// sent to the host; they are forwarded to the consumer over the link interface.
+stream_writer_notify_i notify (.clk(clk), .rst_n(reset_synced)); +mem_config_i mem_config (.clk(clk), .rst_n(reset_synced)); +vaddress_t next_vaddr, next_buffer_vaddr, last_allocation_end_vaddr; + +buffer_t next_buffer; +assign next_buffer.vaddr = next_vaddr; +assign next_buffer.size = TRANFERS_PER_ALLOCATION; +assign mem_config.flush_buffers = 1'b0; +assign mem_config.buffer_data = next_buffer; +// mem_config.buffer_valid has a single driver: the always_comb block below. + +// This state machine ensures that the notification of a completed write is +// received by the consumer on the other end of the link. It also ensures that +// when the current memory allocation is exhausted, a new memory allocation is +// provided on the mem_config interface and acknowledged. +typedef enum logic { + ST_NOT_FULL, + ST_NEEDS_ALLOCATION +} state_t; +state_t state; + +vaddress_t n_next_vaddr; +assign n_next_vaddr = next_vaddr + notify.size; + +always_ff @(posedge clk) begin + if (reset_synced == 1'b0) begin + next_vaddr <= '0; + next_buffer_vaddr <= '0; + last_allocation_end_vaddr <= '0; + + state <= ST_NEEDS_ALLOCATION; + end else begin + case (state) + ST_NOT_FULL: begin + if (notify.ready && notify.valid) begin + next_vaddr <= n_next_vaddr; + + // When we receive a last, the writer is going to assume + // that we want a new allocation for the next stream. This + // is the case when sending data to the host, but not when + // sending data to the card (we don't want to waste memory + // for a new allocation, leaving the current one half-used).
+ if (notify.last || n_next_vaddr >= last_allocation_end_vaddr) begin + state <= ST_NEEDS_ALLOCATION; + end + end + end + + ST_NEEDS_ALLOCATION: begin + if (mem_config.buffer_ready) begin + next_buffer_vaddr <= next_buffer_vaddr + ALLOCATION_BYTES; + last_allocation_end_vaddr <= next_vaddr + ALLOCATION_BYTES; + + state <= ST_NOT_FULL; + end + end + endcase + end +end + +always_comb begin + if (reset_synced == 1'b0) begin + notify.ready = 1'b0; + mem_config.buffer_valid = 1'b0; + end else begin + case (state) + ST_NOT_FULL: begin + notify.ready = link.ready; + mem_config.buffer_valid = 1'b0; + end + + ST_NEEDS_ALLOCATION: begin + notify.ready = 1'b0; + mem_config.buffer_valid = 1'b1; + end + endcase + end + + link.vaddr = next_vaddr; + link.size = notify.size; + link.last = notify.last; + link.valid = state == ST_NOT_FULL && notify.valid; +end + +StreamWriter #( + .STRM(STRM_CARD), + .AXI_STRM_ID(AXI_STRM_ID), + .TRANSFER_LENGTH_BYTES(TRANSFER_SIZE) +) inst_stream_writer ( + .clk(clk), + .rst_n(reset_synced), + + .sq_wr(sq_wr), + .cq_wr(cq_wr), + + .notify(notify), + .mem_config(mem_config), + + .input_data(in), + .output_data(out) +); + +endmodule diff --git a/hardware/src/hdl/buffer/stream_reader.sv b/hardware/src/hdl/buffer/stream_reader.sv new file mode 100644 index 0000000..12ac0f9 --- /dev/null +++ b/hardware/src/hdl/buffer/stream_reader.sv @@ -0,0 +1,221 @@ +`timescale 1ns / 1ps + +import lynxTypes::*; +import libstf::*; + +`include "axi_macros.svh" +`include "lynx_macros.svh" +`include "libstf_macros.svh" + +/* + * NOTE: the input_data should be wired to the AXI stream where incmonig data + * will be streamed after a request has been sent via sq_rd. + * For example, in the case of card memory, it should be + * axis_card_recv[AXI_STRM_ID]. + * NOTE: the TRANSFER_LENGTH_BYTES must be the same as configured in the + * writer. 
+ */ +module StreamReader #( + parameter STRM = STRM_HOST, + parameter AXI_STRM_ID = 0, + parameter IS_LOCAL = 1, + parameter TRANSFER_LENGTH_BYTES = 4096 +) ( + input logic clk, + input logic rst_n, + + metaIntf.m sq_rd, + metaIntf.s cq_rd, + + mem_read_config_i.s mem_config, + + AXI4SR.s input_data, + AXI4S.m output_data +); + +`RESET_RESYNC // Reset pipelining + +metaIntf #(.STYPE(req_t)) debug_sq_rd (.aclk(clk), .aresetn(reset_synced)); +assign debug_sq_rd.data = sq_rd.data; +assign debug_sq_rd.valid = sq_rd.valid; +assign debug_sq_rd.ready = sq_rd.ready; +metaIntf #(.STYPE(req_t)) debug_cq_rd (.aclk(clk), .aresetn(reset_synced)); +assign debug_cq_rd.data = cq_rd.data; +assign debug_cq_rd.valid = cq_rd.valid; +assign debug_cq_rd.ready = cq_rd.ready; + +// -- Parameters ----------------------------------------------------------------------------------- +localparam RDMA_READ = 7; +localparam OPCODE = IS_LOCAL ? LOCAL_READ : RDMA_READ; +// How many bits we need to address one transfer of size TRANSFER_LENGTH_BYTES +localparam TRANSFER_ADDRESS_LEN_BITS = $clog2(TRANSFER_LENGTH_BYTES) + 1; +localparam AXI_DATA_BYTES = (AXI_DATA_BITS / 8); + +// -- Assertions ----------------------------------------------------------------------------------- + +// TRANSFER_LENGTH_BYTES must be a multiple of AXI_DATA_BYTES +`ASSERT_ELAB(TRANSFER_LENGTH_BYTES % AXI_DATA_BYTES == 0) +// This limitations is because we support only 3 bits for the stream identifier in the +// interrupt/notify value +`ASSERT_ELAB(N_STRM_AXI <= 8) + +// Input stream +assert property (@(posedge clk) disable iff (!reset_synced) + !input_data.tvalid || input_data.tlast || &input_data.tkeep) +else $fatal(1, "Non-last keep signal (%h) must be all 1s!", input_data.tkeep); +assert property (@(posedge clk) disable iff (!reset_synced) + !input_data.tvalid || !input_data.tlast || $onehot0(input_data.tkeep + 1'b1)) +else $fatal(1, "Last keep signal (%h) must be contiguous starting from the least significant bit!", 
input_data.tkeep); + +// Allocations +assert property (@(posedge clk) disable iff (!reset_synced) + !mem_config.valid || (mem_config.size > 0)) +else $fatal(1, "Buffer size (%0d) must be > 0!", mem_config.size); + +// -- Input logic --------------------------------------------------------------------------------- +typedef enum logic[2:0] { + WAIT_FOR_BUFFER = 0, + REQUEST = 1, + TRANSFER = 2 +} input_state_t; +input_state_t input_state, n_input_state; + +// The vaddr we currently read from +vaddress_t vaddr, n_vaddr; +// How many bytes we need to read from vaddr. +data32_t len, n_len; +// How many bytes we need to read from vaddr in the next transfer request. +data32_t next_transfer_len, n_next_transfer_len; +// How many bytes we need to read in the ongoing transfer. +data32_t curr_transfer_len, n_curr_transfer_len; +assign mem_config.ready = input_state == WAIT_FOR_BUFFER; + +// Tracking of the amount of data we have written in the current transfer +localparam BEAT_BITS = $clog2(AXI_DATA_BYTES); +localparam TRANSFER_BEAT_COUNTER_WIDTH = TRANSFER_ADDRESS_LEN_BITS - BEAT_BITS; +logic[TRANSFER_BEAT_COUNTER_WIDTH - 1 : 0] beats_read_from_transfer, n_beats_read_from_transfer, beats_read_from_transfer_succ; +logic has_partial_beat, current_transfer_completed; +assign has_partial_beat = |(curr_transfer_len[BEAT_BITS - 1:0]); +assign current_transfer_completed = beats_read_from_transfer_succ == (curr_transfer_len >> BEAT_BITS) + has_partial_beat; +assign beats_read_from_transfer_succ = beats_read_from_transfer + 1; + +// Completions we get +assign cq_rd.ready = 1; +logic is_completion; +// Note: We used to also validate the OP code here. However, the op code is not set correctly by coyote for +// the cq_rd. Therefore, we only validate the strm & dest. This should however never cause any problems! 
+assign is_completion = cq_rd.valid && cq_rd.data.strm == STRM && cq_rd.data.dest == AXI_STRM_ID; + +// -- Send queue requests -------------------------------------------------------------------------- +// Sends a request over transfers with at most TRANSFER_LENGTH_BYTES +always_comb begin + sq_rd.data = '0; // Null everything else + + sq_rd.data.opcode = OPCODE; + sq_rd.data.strm = STRM; + sq_rd.data.mode = ~IS_LOCAL; + sq_rd.data.rdma = ~IS_LOCAL; + sq_rd.data.remote = ~IS_LOCAL; + + // Note: We always send to coyote thread id 0. + sq_rd.data.pid = 0; + sq_rd.data.dest = AXI_STRM_ID; + + sq_rd.data.vaddr = vaddr; + sq_rd.data.len = next_transfer_len; + + // We always mark the transfer as last so we get + // one acknowledgement per transfer! + sq_rd.data.last = 1; + + sq_rd.valid = input_state == REQUEST; +end + +// -- State machine -------------------------------------------------------------------------------- +always_ff @(posedge clk) begin + if (reset_synced == 1'b0) begin + input_state <= WAIT_FOR_BUFFER; + end else begin + beats_read_from_transfer <= n_beats_read_from_transfer; + vaddr <= n_vaddr; + len <= n_len; + next_transfer_len <= n_next_transfer_len; + curr_transfer_len <= n_curr_transfer_len; + + input_state <= n_input_state; + end +end + +always_comb begin + n_beats_read_from_transfer = beats_read_from_transfer; + n_vaddr = vaddr; + n_len = len; + n_next_transfer_len = next_transfer_len; + n_curr_transfer_len = curr_transfer_len; + + n_input_state = input_state; + + case(input_state) + WAIT_FOR_BUFFER: begin + if (mem_config.valid) begin + // Reset the current state + n_vaddr = mem_config.vaddr; + n_len = mem_config.size; + if (n_len > TRANSFER_LENGTH_BYTES) begin + n_next_transfer_len = TRANSFER_LENGTH_BYTES; + end else begin + n_next_transfer_len = n_len; + end + + n_input_state = REQUEST; + end end + REQUEST: begin + if (sq_rd.ready) begin + n_beats_read_from_transfer = '0; + n_vaddr = vaddr + next_transfer_len; + // Possible optimiaztion: 
subtract constant TRANSFER_LENGTH_BYTES + // which is valid except for the last transfer. Deal with overflows. + n_len = len - next_transfer_len; + n_curr_transfer_len = next_transfer_len; + if (n_len > TRANSFER_LENGTH_BYTES) begin + n_next_transfer_len = TRANSFER_LENGTH_BYTES; + end else begin + n_next_transfer_len = n_len; + end + + n_input_state = TRANSFER; + end end + TRANSFER: begin + if (input_data.tvalid && input_data.tready) begin + // If this was the last data beat of the transfer + if (current_transfer_completed) begin + if (len == 0) begin + n_input_state = WAIT_FOR_BUFFER; + end else begin + // Perform next transfer! + n_input_state = REQUEST; + end + end else begin + n_beats_read_from_transfer = beats_read_from_transfer_succ; + end + end end + default:; + endcase +end + +// -- Assign output data --------------------------------------------------------------------------- +AXI4S internal_data(.aclk(clk), .aresetn(reset_synced)); +AXI4S output_axis (.aclk(clk), .aresetn(reset_synced)); + +assign internal_data.tdata = input_data.tdata; + +assign internal_data.tkeep = input_data.tkeep; +assign internal_data.tlast = current_transfer_completed; +assign internal_data.tvalid = input_state == TRANSFER && input_data.tvalid; +assign input_data.tready = input_state == TRANSFER && internal_data.tready; + +AXISkidBuffer inst_skid_buffer (.clk(clk), .rst_n(reset_synced), .in(internal_data), .out(output_axis)); + +`AXIS_ASSIGN(output_axis, output_data); + +endmodule diff --git a/hardware/src/hdl/config/config_interfaces.sv b/hardware/src/hdl/config/config_interfaces.sv index 8466617..3dd290f 100644 --- a/hardware/src/hdl/config/config_interfaces.sv +++ b/hardware/src/hdl/config/config_interfaces.sv @@ -114,7 +114,7 @@ endinterface /** * Interface that bundles all memory configuration information for the OutputWriter module. 
*/ -interface mem_config_i( +interface mem_config_i ( input logic clk, input logic rst_n ); diff --git a/hardware/src/hdl/config/registers/config_read_register_file.sv b/hardware/src/hdl/config/registers/config_read_register_file.sv index 790bdc4..1bff3c9 100644 --- a/hardware/src/hdl/config/registers/config_read_register_file.sv +++ b/hardware/src/hdl/config/registers/config_read_register_file.sv @@ -16,7 +16,7 @@ module ConfigReadRegisterFile #( ); // -- Assertions ----------------------------------------------------------------------------------- -`ASSERT_ELAB(NUM_REGS > 1) +`ASSERT_ELAB(NUM_REGS > 0) // -- Signals -------------------------------------------------------------------------------------- typedef enum logic {WAIT, RESPOND} state_t; diff --git a/hardware/src/hdl/output/notify_meta_intf_adapter.sv b/hardware/src/hdl/output/notify_meta_intf_adapter.sv new file mode 100644 index 0000000..9035394 --- /dev/null +++ b/hardware/src/hdl/output/notify_meta_intf_adapter.sv @@ -0,0 +1,24 @@ +`timescale 1ns / 1ps + +module NotifyMetaIntfAdapter #( + parameter AXI_STRM_ID = 0, + parameter PID = 0 +) ( + stream_writer_notify_i.s in, + metaIntf.m out +); + +always_comb begin +out.data.pid = 6'(PID); +// The output value has 32 bits and consists of: +// 1. The stream id that finished the transfer +out.data.value[2:0] = AXI_STRM_ID; +// 2. How much data was written to the vaddr (at most 2^28 bytes are supported) +out.data.value[30:3] = in.size; +// 3. Whether this was the last transfer, i.e.
all output data was written +out.data.value[31] = in.last; +out.valid = in.valid; +in.ready = out.ready; +end + +endmodule diff --git a/hardware/src/hdl/output/output_writer.sv b/hardware/src/hdl/output/output_writer.sv index b3f54bc..534a9b9 100644 --- a/hardware/src/hdl/output/output_writer.sv +++ b/hardware/src/hdl/output/output_writer.sv @@ -79,6 +79,8 @@ MetaIntfArbiter #( // -- FPGA-initiated transfers --------------------------------------------------------------------- for(genvar I = 0; I < N_STRM_AXI; I++) begin `ifndef DISABLE_OUTPUT_WRITER + stream_writer_notify_i notify (.clk(clk), .rst_n(reset_synced)); + // Invoke the FPGA-initiated transfers for this stream StreamWriter #( .AXI_STRM_ID(I), @@ -89,13 +91,20 @@ for(genvar I = 0; I < N_STRM_AXI; I++) begin .sq_wr(sq_wr_strm[I]), .cq_wr(cq_wr_strm[I]), - .notify(notify_strm[I]), + .notify(notify), .mem_config(mem_config[I]), .input_data(data_in[I]), .output_data(data_out[I]) ); + + NotifyMetaIntfAdapter #( + .AXI_STRM_ID(I) + ) inst_notify_adapter ( + .in(notify), + .out(notify_strm[I]) + ); `else // Tie of the interfaces we don't need always_comb sq_wr_strm [I].tie_off_m(); diff --git a/hardware/src/hdl/output/stream_writer.sv b/hardware/src/hdl/output/stream_writer.sv index 0700707..6ba3e9c 100644 --- a/hardware/src/hdl/output/stream_writer.sv +++ b/hardware/src/hdl/output/stream_writer.sv @@ -6,6 +6,7 @@ import libstf::*; `include "axi_macros.svh" `include "lynx_macros.svh" `include "libstf_macros.svh" +`include "config_macros.svh" /** * This module takes the data from input_data and transfers it to the memory regions provided by @@ -40,8 +41,9 @@ module StreamWriter #( metaIntf.m sq_wr, metaIntf.s cq_wr, - metaIntf.m notify, // This module triggers an interrupt when all transfers are done + stream_writer_notify_i .m notify, // This module triggers an interrupt when more + // memory is required or all transfers are completed mem_config_i.s mem_config, AXI4S.s input_data, @@ -159,13 +161,14 @@ FIFO #( 
// -- Output logic --------------------------------------------------------------------------------- typedef enum logic[2:0] { - WAIT_FOR_BUFFER = 0, - REQUEST = 1, - TRANSFER = 2, - WAIT_COMPLETION = 3, - WAIT_NOTIFY = 4, - ALL_DONE = 5, - FLUSH_BUFFERS = 6 + RESET = 0, + WAIT_FOR_BUFFER = 1, + REQUEST = 2, + TRANSFER = 3, + WAIT_COMPLETION = 4, + WAIT_NOTIFY = 5, + ALL_DONE = 6, + FLUSH_BUFFERS = 7 } output_state_t; output_state_t output_state, n_output_state; @@ -235,14 +238,8 @@ assign all_transfers_completed = num_completed_transfers == num_requests; logic last_transfer, n_last_transfer; always_comb begin - notify.data.pid = 6'd0; - // The output value has 32 bits and consists of: - // 1. The stream id that finished the transfer - notify.data.value[2:0] = AXI_STRM_ID; - // 2. How much data as written to the vaddr (at most 2^28 bytes are supported) - notify.data.value[30:3] = bytes_written_to_allocation; - // 3. Whether this was the last transfer, i.e. all output data was written - notify.data.value[31] = last_transfer; + notify.size = bytes_written_to_allocation; + notify.last = last_transfer; notify.valid = (output_state == WAIT_COMPLETION && all_transfers_completed) || (output_state == WAIT_NOTIFY); end @@ -250,7 +247,7 @@ end // -- State machine -------------------------------------------------------------------------------- always_ff @(posedge clk) begin if (reset_synced == 1'b0) begin - output_state <= WAIT_FOR_BUFFER; + output_state <= RESET; end else begin bytes_written_to_allocation <= n_bytes_written_to_allocation; beats_written_to_transfer <= n_beats_written_to_transfer; @@ -284,6 +281,9 @@ always_comb begin end case(output_state) + RESET: begin + n_output_state = WAIT_FOR_BUFFER; + end WAIT_FOR_BUFFER: begin if (buffer.valid) begin // Reset the current state diff --git a/hardware/src/hdl/output/stream_writer_interface.sv b/hardware/src/hdl/output/stream_writer_interface.sv new file mode 100644 index 0000000..ddd3850 --- /dev/null +++ 
b/hardware/src/hdl/output/stream_writer_interface.sv @@ -0,0 +1,52 @@ +`timescale 1ns / 1ps + +import libstf::*; + +`include "libstf_macros.svh" + +/* + * This interface is used to notify of a completed transfer or request more + * memory allocations on the target device. The StreamWriter will place a + * request on this interface and wait for acknowledgement. + * + * In this context, acknowledgement implies that the request has been received, + * not that a memory segment has been allocated. That is communicated to the + * StreamWriter via the mem_config_i interface. + */ +interface stream_writer_notify_i ( + input logic clk, + input logic rst_n +); + vaddress_t size; + logic last; + logic valid; + logic ready; + + task tie_off_m(); + valid = 1'b0; + endtask + + task tie_off_s(); + ready = 1'b1; + endtask + + modport m ( + import tie_off_m, + output size, last, valid, + input ready + ); + + modport s ( + import tie_off_s, + output ready, + input size, last, valid + ); + +`ifndef SYNTHESIS + `STF_ASSERT_SIGNAL_STABLE(size); + `STF_ASSERT_SIGNAL_STABLE(last); + + `STF_ASSERT_NOT_UNDEFINED(valid); + `STF_ASSERT_NOT_UNDEFINED(ready); +`endif +endinterface diff --git a/hardware/unit-tests/stream_buffer_test.py b/hardware/unit-tests/stream_buffer_test.py new file mode 100644 index 0000000..12ef78c --- /dev/null +++ b/hardware/unit-tests/stream_buffer_test.py @@ -0,0 +1,72 @@ +from typing import List +from coyote_test import constants, fpga_test_case +from unit_test.fpga_stream import Stream, StreamType + +MAX_NUMBER_STREAMS = constants.MAX_NUMBER_STREAMS +TRANSFER_SIZE_BYTES_OVERWRITE = "TRANSFER_SIZE_BYTES_OVERWRITE" + +class StreamBufferTest(fpga_test_case.FPGATestCase): + """ + Tests the behavior of the HBM caching of AXI streams. 
+ """ + + alternative_vfpga_top_file = "vfpga_tops/stream_buffer_test.sv" + debug_mode = True + verbose_logging = True + + def setUp(self): + super().setUp() + # Data to write to the HBM, sequentially as provided in this list + self.writes: List[Stream] = [] + + def simulate_fpga(self): + card_allocation_size = 256 * 1024 * 1024 + assert len(self.writes) > 0, "Cannot perform output test with 0 writes" + + for stream in self.writes: + self.set_stream_input(0, stream) + + # NOTE: it is important that the expected outputs are set after the inputs + # as this affects the order of the mappings, which in turn affects the mappings + # on the card memory, which MUST match the ones on the host input. + # + # If this is not the case, the writes would be unaligned with the allocated + # memory segments, and even if the memory region overall is allocated, + # it would be split in different segments, thus raising an exception: + # + # stream_simulation.svh:129: CARD[0]: No segment found to write data to in memory. 
+ # + for stream in self.writes: + self.set_expected_output(0, stream) + + return super().simulate_fpga() + + def test_one_write_read(self): + l = 128 + self.writes.append(Stream(StreamType.UNSIGNED_INT_64, list(range(0, l)))) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_two_write_read(self): + self.writes.append(Stream(StreamType.UNSIGNED_INT_64, list(range(0, 64)))) + self.writes.append(Stream(StreamType.UNSIGNED_INT_64, list(range(0, 23)))) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() + + def test_many_small_write_read(self): + for l in range(10, 30): + self.writes.append(Stream(StreamType.UNSIGNED_INT_8, list(range(0, l)))) + + # Act + self.simulate_fpga() + + # Assert + self.assert_simulation_output() diff --git a/hardware/unit-tests/vfpga_tops/dict_test.sv b/hardware/unit-tests/vfpga_tops/dict_test.sv index a4ad4c3..4916c51 100644 --- a/hardware/unit-tests/vfpga_tops/dict_test.sv +++ b/hardware/unit-tests/vfpga_tops/dict_test.sv @@ -15,6 +15,10 @@ always_comb cq_wr.tie_off_s(); for (genvar I = 2; I < N_STRM_AXI; I++) begin always_comb axis_host_recv[I].tie_off_s(); end +for (genvar I = 0; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); +end // -- Types ---------------------------------------------------------------------------------------- typedef logic[0:0] select_t; diff --git a/hardware/unit-tests/vfpga_tops/normalizer_test.sv b/hardware/unit-tests/vfpga_tops/normalizer_test.sv index f55a7da..7cb1e56 100644 --- a/hardware/unit-tests/vfpga_tops/normalizer_test.sv +++ b/hardware/unit-tests/vfpga_tops/normalizer_test.sv @@ -14,6 +14,10 @@ for (genvar I = 1; I < N_STRM_AXI; I++) begin always_comb axis_host_send[I].tie_off_m(); always_comb axis_host_recv[I].tie_off_s(); end +for (genvar I = 0; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); 
+end // -- Fix clock and reset names -------------------------------------------------------------------- logic clk; diff --git a/hardware/unit-tests/vfpga_tops/output_writer_test.sv b/hardware/unit-tests/vfpga_tops/output_writer_test.sv index d5d4988..e56a3b6 100644 --- a/hardware/unit-tests/vfpga_tops/output_writer_test.sv +++ b/hardware/unit-tests/vfpga_tops/output_writer_test.sv @@ -1,8 +1,12 @@ - // -- Tie-off unused interfaces and signals -------------------------------------------------------- always_comb sq_rd.tie_off_m(); always_comb cq_rd.tie_off_s(); +for (genvar I = 0; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); +end + // -- Fix clock and reset names -------------------------------------------------------------------- logic clk; logic rst_n; diff --git a/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv b/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv index 7b061e4..7b59a8c 100644 --- a/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv +++ b/hardware/unit-tests/vfpga_tops/sorted_seq_to_bitmask_test.sv @@ -14,6 +14,10 @@ for (genvar I = 1; I < N_STRM_AXI; I++) begin always_comb axis_host_recv[I].tie_off_s(); always_comb axis_host_send[I].tie_off_m(); end +for (genvar I = 0; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); +end // -- Fix clock and reset names -------------------------------------------------------------------- logic clk; diff --git a/hardware/unit-tests/vfpga_tops/stream_buffer_test.sv b/hardware/unit-tests/vfpga_tops/stream_buffer_test.sv new file mode 100644 index 0000000..8ee791e --- /dev/null +++ b/hardware/unit-tests/vfpga_tops/stream_buffer_test.sv @@ -0,0 +1,61 @@ +// -- Tie-off unused interfaces and signals -------------------------------------------------------- +always_comb axi_ctrl.tie_off_s(); +always_comb notify.tie_off_m(); + +for (genvar I = 1; I < 
N_STRM_AXI; I++) begin + always_comb axis_host_send[I].tie_off_m(); + always_comb axis_host_recv[I].tie_off_s(); +end +for (genvar I = 1; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); +end + +// -- Fix clock and reset names -------------------------------------------------------------------- +logic clk; +logic rst_n; + +assign clk = aclk; +assign rst_n = aresetn; + +// -- Input ---------------------------------------------- +AXI4S axi_host_send_0(.aclk(clk), .aresetn(rst_n)); +`AXIS_ASSIGN(axi_host_send_0, axis_host_send[0]) // AXI4SR to AXI4S + +// -- Output --------------------------------------------- +AXI4S axi_host_recv_0(.aclk(clk), .aresetn(rst_n)); +`AXIS_ASSIGN(axis_host_recv[0], axi_host_recv_0) // AXI4SR to AXI4S + +// -- Design wiring -------------------------------------- +stream_buffer_link_i link (.clk(clk), .rst_n(rst_n)); + +StreamBufferWriter #( + .TRANSFER_SIZE(64), + .TRANFERS_PER_ALLOCATION(1) +) inst_stream_buffer_writer ( + .clk(aclk), + .rst_n(rst_n), + + .sq_wr(sq_wr), + .cq_wr(cq_wr), + + .link(link), + + .in(axi_host_recv_0), + .out(axis_card_send[0]) +); + +StreamBufferReader #( + .TRANSFER_SIZE(64) +) inst_stream_buffer_reader ( + .clk(aclk), + .rst_n(rst_n), + + .sq_rd(sq_rd), + .cq_rd(cq_rd), + + .link(link), + + .in(axis_card_recv[0]), + .out(axi_host_send_0) +); diff --git a/hardware/unit-tests/vfpga_tops/typed_dict_test.sv b/hardware/unit-tests/vfpga_tops/typed_dict_test.sv index b263619..2add83a 100644 --- a/hardware/unit-tests/vfpga_tops/typed_dict_test.sv +++ b/hardware/unit-tests/vfpga_tops/typed_dict_test.sv @@ -16,6 +16,10 @@ always_comb cq_wr.tie_off_s(); for (genvar I = 2; I < N_STRM_AXI; I++) begin always_comb axis_host_recv[I].tie_off_s(); end +for (genvar I = 0; I < N_CARD_AXI; I++) begin + always_comb axis_card_send[I].tie_off_m(); + always_comb axis_card_recv[I].tie_off_s(); +end // -- Types
---------------------------------------------------------------------------------------- typedef logic[0:0] select_t;