diff --git a/README.md b/README.md
index 3bfc018..9838da9 100644
--- a/README.md
+++ b/README.md
@@ -35,11 +35,35 @@ The paper proposes embedding thousands of these security blocks throughout an AI
 
 ## Quickstart
 
-### Prerequisites
+### REVISIT Verilog (SystemVerilog + Verilator)
+
+#### Prerequisites
+
+- **Verilator** (5.x+)
+
+#### Run Tests
+
+```bash
+cd verilog
+
+# Run the full security block test suite (14 tests)
+make sim TB=top
+
+# Run individual test benches
+make sim TB=ecdsa
+make sim TB=arith
+
+# Lint
+make lint
+```
+
+### HardCaml (OCaml reference model)
+
+#### Prerequisites
 
 - **OCaml** (4.14+) and **opam**
 
-### Installation
+#### Installation
 
 ```bash
 # Install opam if needed (macOS: brew install opam, Ubuntu: apt install opam)
@@ -47,7 +71,7 @@ opam init
 eval $(opam env)
 
 # Install dependencies
-opam install hardcaml hardcaml_waveterm ppx_hardcaml zarith 
+opam install hardcaml hardcaml_waveterm ppx_hardcaml zarith
 
 # Clone and build
 git clone https://github.com/JamesPetrie/off-switch
@@ -55,7 +79,7 @@ cd off-switch
 dune build
 ```
 
-### Run Tests
+#### Run Tests
 
 ```bash
 # Run security block test suite
@@ -90,9 +114,9 @@ flowchart TB
         end
 
         SL -->|request_new| TRNG
-        TRNG -->|"nonce, valid"| SL
-        SL -->|start| ECDSA
-        ECDSA -->|"done, valid"| SL
+        TRNG -->|"nonce_valid, nonce"| SL
+        SL -->|valid| ECDSA
+        ECDSA -->|"ready, verif_passed"| SL
         SL -->|increment| ALLOW
         ALLOW -->|enabled| AND
         ADDER --> AND
@@ -104,9 +128,9 @@ flowchart TB
         WOUT["Workload<br/>Output"]:::external
     end
 
-    AUTH <-->|"license_submit, r, s<br/>nonce, ready"| SL
-    WIN --> ADDER
-    AND --> WOUT
+    AUTH <-->|"license_valid, r, s<br/>nonce_ready, nonce, license_ready"| SL
+    WIN -->|"workload_valid, workload_a, workload_b"| ADDER
+    AND -->|"result_valid, workload_result"| WOUT
 
     classDef external fill:#fff,stroke:#333,stroke-dasharray: 5 5
     classDef security fill:#cce5ff,stroke:#004085
@@ -122,9 +146,9 @@ flowchart TB
 
 | Module | Type | Purpose |
 |--------|------|---------|
-| `Trng` | Submodule | Nonce generation (256-bit counter in prototype; ring oscillator in production) |
-| `Ecdsa` | Submodule | Signature verification using secp256k1 curve |
-| Security Logic | Inline | State machine orchestration (7 states) |
+| `trng` | Submodule | Nonce generation (256-bit counter in prototype; ring oscillator in production) |
+| `ecdsa` | Submodule | Signature verification using secp256k1 curve |
+| Security Logic | Inline | State machine orchestration (5 states) |
 | Usage Allowance | Inline | 64-bit authorization counter |
 | Workload | Inline | Gated essential operation (Int8 Add example) |
 
@@ -139,7 +163,7 @@ The authorization protocol follows Section 2 of the paper (see Figure 2):
 1. TRNG generates nonce (at initialization or after valid license)
 2. Security Logic latches and publishes nonce (`nonce_ready` = 1)
 3. External authority reads nonce, signs it with private key
-4. Authority submits license (r, s) via `license_submit` pulse
+4. Authority submits license (r, s) via valid-ready handshake (`license_valid`/`license_ready`)
 5. ECDSA verifies signature against nonce and hardcoded public key
 6. **If valid:**
    - Allowance incremented
@@ -151,7 +175,7 @@ The authorization protocol follows Section 2 of the paper (see Figure 2):
 
 ### Workload Flow
 
-1. Workload inputs (`int8_a`, `int8_b`) arrive with `workload_valid` = 1
+1. Workload inputs (`workload_a`, `workload_b`) arrive with `workload_valid` = 1
 2. Computation performed (Int8 addition, wrapping on overflow)
 3. Output gating: each result bit ANDed with `enabled` signal
    - If `allowance > 0`: `enabled` = 1, result passes through
@@ -210,43 +234,39 @@ The paper's Section 4 discusses attack vectors against these assumptions in deta
 
 | Signal | Width | Description |
 |--------|-------|-------------|
-| `clock` | 1 | System clock |
-| `clear` | 1 | Synchronous reset (active high) |
-| `license_submit` | 1 | Pulse high for one cycle to submit license |
+| `clk` | 1 | System clock |
+| `rst_n` | 1 | Asynchronous reset (active low) |
+| `license_valid` | 1 | License submission request (hold until `license_ready`) |
 | `license_r` | 256 | ECDSA signature r component |
 | `license_s` | 256 | ECDSA signature s component |
 | `workload_valid` | 1 | Workload input data valid |
-| `int8_a` | 8 | Signed 8-bit operand A |
-| `int8_b` | 8 | Signed 8-bit operand B |
-| `param_a` | 256 | ECDSA curve parameter a (0 for secp256k1) |
-| `param_b3` | 256 | ECDSA curve parameter 3b (21 for secp256k1) |
-| `trng_seed` | 256 | Seed value for TRNG (testing only) |
+| `workload_a` | 8 | Workload operand A |
+| `workload_b` | 8 | Workload operand B |
 | `trng_load_seed` | 1 | Load seed into TRNG (testing only) |
+| `trng_seed` | 256 | Seed value for TRNG (testing only) |
 
 ### Top-Level Outputs
 
 | Signal | Width | Description |
 |--------|-------|-------------|
+| `license_ready` | 1 | License verification complete (pulse) |
 | `nonce` | 256 | Current nonce value |
 | `nonce_ready` | 1 | Nonce is stable and ready for signing |
-| `int8_result` | 8 | Gated workload output |
+| `workload_result` | 8 | Gated workload output |
 | `result_valid` | 1 | Result output is valid |
 | `allowance` | 64 | Current allowance counter value |
 | `enabled` | 1 | Allowance > 0 |
-| `state_debug` | 4 | Current state machine state (debug) |
-| `licenses_accepted` | 16 | Count of valid licenses processed (debug) |
-| `ecdsa_busy` | 1 | ECDSA verification in progress (debug) |
 
 ### TRNG Submodule Interface
 
 | Direction | Signal | Width | Description |
 |-----------|--------|-------|-------------|
-| Input | `clock` | 1 | System clock |
-| Input | `clear` | 1 | Synchronous reset |
+| Input | `clk` | 1 | System clock |
+| Input | `rst_n` | 1 | Asynchronous reset (active low) |
 | Input | `enable` | 1 | Enable entropy counter |
 | Input | `request_new` | 1 | Pulse to latch new nonce |
-| Input | `seed` | 256 | Seed value (testing only) |
 | Input | `load_seed` | 1 | Load seed (testing only) |
+| Input | `seed` | 256 | Seed value (testing only) |
 | Output | `nonce` | 256 | Latched nonce value |
 | Output | `nonce_valid` | 1 | Nonce has been latched |
 
@@ -254,17 +274,14 @@ The paper's Section 4 discusses attack vectors against these assumptions in deta
 
 | Direction | Signal | Width | Description |
 |-----------|--------|-------|-------------|
-| Input | `clock` | 1 | System clock |
-| Input | `clear` | 1 | Synchronous reset |
-| Input | `start` | 1 | Pulse to begin verification |
+| Input | `clk` | 1 | System clock |
+| Input | `rst_n` | 1 | Asynchronous reset (active low) |
+| Input | `valid` | 1 | Start verification (hold until `ready`) |
 | Input | `z` | 256 | Message hash (= nonce) |
 | Input | `r` | 256 | Signature r component |
 | Input | `s` | 256 | Signature s component |
-| Input | `param_a` | 256 | Curve parameter a |
-| Input | `param_b3` | 256 | Curve parameter 3b |
-| Output | `done_` | 1 | Verification complete (pulse) |
-| Output | `valid` | 1 | Signature is valid |
-| Output | `busy` | 1 | Verification in progress |
+| Output | `ready` | 1 | Verification complete (pulse) |
+| Output | `verif_passed` | 1 | Signature is valid |
 
 ---
 
@@ -274,32 +291,24 @@ The paper's Section 4 discusses attack vectors against these assumptions in deta
 
 ```mermaid
 stateDiagram-v2
-    [*] --> Init_delay
-    Init_delay --> Request_nonce: counter ≥ 100
-    Request_nonce --> Wait_nonce: immediate
-    Wait_nonce --> Publish: nonce_valid
-    Publish --> Verify_start: license_submit
-    Verify_start --> Verify_wait: !ecdsa.busy
-    Verify_wait --> Update: ecdsa.done_
-    Update --> Request_nonce: valid
-    Update --> Publish: invalid
+    [*] --> StInitDelay
+    StInitDelay --> StRequestNonce: counter ≥ 100
+    StRequestNonce --> StWaitNonce: immediate
+    StWaitNonce --> StPublishAndWait: nonce_valid
+    StPublishAndWait --> StWaitVerify: license_valid
+    StWaitVerify --> StRequestNonce: verif passed
+    StWaitVerify --> StPublishAndWait: verif failed
 ```
 
 ### State Descriptions
 
 | State | Entry Condition | Actions | Exit Condition |
 |-------|-----------------|---------|----------------|
-| `Init_delay` | Reset | Increment delay counter | Counter ≥ 100 |
-| `Request_nonce` | From Init_delay or Update (valid) | Assert `request_new` to TRNG | Immediate |
-| `Wait_nonce` | From Request_nonce | Wait for TRNG | `nonce_valid` |
-| `Publish` | From Wait_nonce or Update (invalid) | Latch nonce; `nonce_ready` = 1 | `license_submit` |
-| `Verify_start` | From Publish | Latch r, s; assert `ecdsa_start` | `!ecdsa.busy` |
-| `Verify_wait` | From Verify_start | Wait for ECDSA | `ecdsa.done_` |
-| `Update` | From Verify_wait | If valid: increment allowance | Immediate |
-
----
-
-Here's an expanded section on the ECDSA and modular arithmetic architecture to add to the README:
+| `StInitDelay` | Reset | Increment delay counter | Counter ≥ 100 |
+| `StRequestNonce` | From StInitDelay or StWaitVerify (valid) | Pulse `request_new` to TRNG | Immediate |
+| `StWaitNonce` | From StRequestNonce | Wait for TRNG | `nonce_valid` |
+| `StPublishAndWait` | From StWaitNonce or StWaitVerify (invalid) | `nonce_ready` = 1; wait for license | `license_valid` |
+| `StWaitVerify` | From StPublishAndWait | Wait for ECDSA; if valid: increment allowance | `ecdsa_ready` |
 
 ---
 
@@ -315,21 +324,18 @@ flowchart TB
 
             subgraph SM["State Machine"]
                 direction TB
-                SM_PREP["Prep Phase<br/>u1, u2 computation"]
+                SM_PREP["Prepare<br/>u1, u2 computation"]
                 SM_LOOP["Scalar Mult Loop<br/>256 iterations"]
-                SM_FIN["Finalize<br/>projective to affine"]
-                SM_CMP["Compare<br/>x_affine == r ?"]
+                SM_FIN["Finalize<br/>projective → affine<br/>compare x == r"]
 
                 SM_PREP --> SM_LOOP
                 SM_LOOP --> SM_FIN
-                SM_FIN --> SM_CMP
             end
 
-            subgraph REGS["Register File --- 17 x 256-bit"]
+            subgraph REGS["Register File --- 15 x 256-bit"]
                 direction LR
                 R_PT["Point Coords<br/>X1 Y1 Z1<br/>X2 Y2 Z2<br/>X3 Y3 Z3"]
                 R_TMP["Temps<br/>t0 - t5"]
-                R_PRM["Params<br/>a, b3"]
             end
         end
 
@@ -339,19 +345,17 @@ flowchart TB
             subgraph ARITH["Modular Arithmetic Unit"]
                 direction TB
 
-                subgraph INV["Inverse<br/>Ext Euclidean"]
+                subgraph INV["Inverse<br/>Binary Ext GCD"]
                     direction TB
                 end
 
-                subgraph MUL["Multiply<br/>shift-and-add"]
+                subgraph MUL["Multiply<br/>Shift-and-Add"]
                     direction TB
                 end
 
                 subgraph ADDSUB["Add - Sub"]
                     direction TB
-                    MOD["Modulus Select<br/>prime p or order n"]
                     ADD256["256-bit Adder"]
-                    MOD --> ADD256
                 end
 
                 INV --> ADDSUB
@@ -359,12 +363,12 @@ flowchart TB
             end
         end
 
-        SM <-->|"start, op<br/>done"| ARITH
+        SM <-->|"valid, op<br/>ready"| ARITH
         REGS <-->|"read A B<br/>write result"| ARITH
     end
 
     EXT_IN["Inputs:<br/>z, r, s"] --> ECDSA
-    ECDSA --> EXT_OUT["Output:<br/>valid"]
+    ECDSA --> EXT_OUT["Output:<br/>verif_passed"]
 
     classDef outer fill:#f0f7ff,stroke:#2563eb,stroke-width:2px,color:#1e40af
     classDef arithbox fill:#fef9e7,stroke:#b7950b,stroke-width:2px,color:#7d6608
@@ -379,11 +383,11 @@ flowchart TB
     class ECDSA outer
     class ARITH arithbox
     class SM smbox
-    class SM_PREP,SM_LOOP,SM_FIN,SM_CMP smnode
+    class SM_PREP,SM_LOOP,SM_FIN smnode
     class REGS regsbox
-    class R_PT,R_TMP,R_PRM regsnode
+    class R_PT,R_TMP regsnode
     class INV,MUL,ADDSUB subunit
-    class shared,MOD,ADD256 sharedbox
+    class shared,ADD256 sharedbox
     class EXT_IN,EXT_OUT external
 ```
 
@@ -414,7 +418,7 @@ Computing `u₁·G + u₂·Q` naively would require two separate scalar multipli
 For each bit position `i` from 255 down to 0:
 1. **Double** the accumulator point `P`
 2. **Add** a precomputed point based on the bit pair `(u₁[i], u₂[i])`:
-   - `(0,0)`: add nothing (skip)
+   - `(0,0)`: add nothing
    - `(1,0)`: add `G`
    - `(0,1)`: add `Q`
    - `(1,1)`: add `G+Q` (precomputed)
@@ -428,18 +432,19 @@ Point addition uses the complete addition formulas from Renes, Costello, and Bat
 - Avoid branching on point values, which simplifies the state machine and improves side-channel resistance
 - Require only field operations (add, subtract, multiply) with no inversions during the main loop
 
-Each point addition/doubling executes a fixed sequence of 40 field operations, implemented as a microcode program:
+Each point addition/doubling executes a fixed sequence of 40 field operations, implemented as a microcode ROM:
 
-```ocaml
-let program = [|
-  { op = Op.mul; src1 = Config.x1; src2 = Config.x2; dst = Config.t0 };  (* t0 = X1·X2 *)
-  { op = Op.mul; src1 = Config.y1; src2 = Config.y2; dst = Config.t1 };  (* t1 = Y1·Y2 *)
-  { op = Op.mul; src1 = Config.z1; src2 = Config.z2; dst = Config.t2 };  (* t2 = Z1·Z2 *)
-  (* ... 37 more operations ... *)
-|]
+```systemverilog
+localparam instr_t PROGRAM [ROM_SIZE] = '{
+    // ... Point addition (Renes-Costello-Batina, 40 steps) ...
+    '{op: OP_MUL, src1: X1, src2: X2, dst: T0},   // t0 = X1·X2
+    '{op: OP_MUL, src1: Y1, src2: Y2, dst: T1},   // t1 = Y1·Y2
+    '{op: OP_MUL, src1: Z1, src2: Z2, dst: T2},   // t2 = Z1·Z2
+    // ... 37 more operations ...
+};
 ```
 
-The formula uses 6 temporary registers (`t0`–`t5`) plus input/output point coordinates and curve parameters, for a total of 17 registers.
+The formula uses 6 temporary registers (`t0`–`t5`) plus input/output point coordinates (`X1`–`Z3`), for a total of 15 registers. Curve constants `a` and `3b` are addressed as pseudo-registers but are hardcoded, not stored.
 
 ### Modular Arithmetic Unit
 
@@ -449,41 +454,36 @@ The `Arith` module provides the four operations needed for elliptic curve arithm
 |-----------|-------------|-----------|
 | `add` | `(a + b) mod m` | Add with conditional subtraction |
 | `sub` | `(a - b) mod m` | Subtract with conditional addition |
-| `mul` | `(a · b) mod m` | Montgomery multiplication (256 iterations) |
-| `inv` | `a⁻¹ mod m` | Extended Euclidean algorithm |
+| `mul` | `(a · b) mod m` | Binary shift-and-add (256 iterations) |
+| `inv` | `a⁻¹ mod m` | Binary Extended GCD |
 
 All operations work over 256-bit operands and can use either the field prime `p` or curve order `n` as the modulus:
 - Point arithmetic (during scalar multiplication) uses `mod p`
 - Scalar preparation (`u₁`, `u₂` computation) and final comparison use `mod n`
 
-The arithmetic unit interfaces with a 17-register file. Operations are started with a pulse and signal completion via `done_`. Typical cycle counts:
+The arithmetic unit interfaces with the register file. Operations are started by asserting `valid` and signal completion via `ready`. Typical cycle counts:
 - Add/Sub: 2–3 cycles
-- Mul: ~500-1000 cycles (bit-serial, varies with y input)
+- Mul: ~500-1000 cycles (bit-serial, varies with b input)
 - Inv: ~2000–3000 cycles (varies with input)
 
 ### State Machine Overview
 
 The ECDSA verification state machine proceeds through these phases:
 
-```
-Idle → Prep_op → Loop ⟷ Load → Run_add → Finalize_op → Compare → Done
-         ↑__________________|
-```
-
-**Prep_op** (3 operations, using `mod n`):
+**StPrepare** (3 operations, using `mod n`):
 1. `w = s⁻¹ mod n`
 2. `u₁ = z · w mod n`
 3. `u₂ = r · w mod n`
 
-**Loop/Load/Run_add** (256 bit positions × ~40 ops each):
-- For each bit position, double the accumulator and conditionally add `G`, `Q`, or `G+Q`
+**StAdd/StDouble** (2 × 256 bit positions × 40 ops each):
+- For each bit position (MSB to LSB), add a selected point then double the accumulator
+- Point selection via Shamir's trick: `G`, `Q`, `G+Q`, or infinity based on `(u₁[i], u₂[i])`
 - Point at infinity handled via projective coordinates (`Z = 0`)
 
-**Finalize_op** (2 operations, using `mod p`):
+**StFinalize** (3 operations, using `mod p`):
 1. `z_inv = Z⁻¹ mod p` (convert from projective to affine)
 2. `x_affine = X · z_inv mod p`
-
-**Compare**: Check if `x_affine == r`
+3. `diff = x_affine - r mod p` (valid if `diff == 0`)
 
 ### Cycle Count
 
@@ -491,13 +491,14 @@ Total verification takes approximately 5 million cycles, dominated by the ~256 p
 
 ### Hardcoded Constants
 
-The prototype hardcodes:
+The prototype hardcodes the following secp256k1 constants:
 - Generator point `G` (from secp256k1 specification)
-- Public key `Q = 2G` (would be chip-specific in production)
-- Precomputed sum `G + Q = 3G`
+- Public key `Q = d · G`, where `d` is the private key (the prototype uses `Q = 2G` for testing; `Q` would be chip-specific in production)
+- Precomputed sum `GPQ = G + Q`
 - Point at infinity `(0, 1, 0)` in projective coordinates
-- Field prime `p = 2²⁵⁶ - 2³² - 977`
-- Curve order `n = 2²⁵⁶ - 432420386565659656852420866394968145599`
+- Field prime `p = 2²⁵⁶ - 2³² - 977` (from secp256k1 specification)
+- Curve order `n = 2²⁵⁶ - 432420386565659656852420866394968145599` (from secp256k1 specification)
+- Curve parameters `a = 0`, `b = 7` (y² = x³ + ax + b, from secp256k1 specification)
 
 In production, `Q` would be unique per chip (or per batch) and stored in Mask ROM, as recommended in the paper. The other constants are fixed by the secp256k1 specification.
 
@@ -521,11 +522,11 @@ This implementation omits several features needed for production:
 
 | Operation | Cycles | Notes |
 |-----------|--------|-------|
-| Initialization delay | 100 | Configurable via `Config.init_delay_cycles` |
+| Initialization delay | 100 | Configurable via `INIT_DELAY_CYCLES` |
 | Nonce generation | 2 | Request + latch |
-| License verification | ~10⁶ | ECDSA scalar multiplication dominates |
+| License verification | ~5×10⁶ | ECDSA scalar multiplication dominates |
 | Workload operation | 1 | Combinational add + output register |
-| Allowance per license | 10¹² | Configurable via `Config.allowance_increment` |
+| Allowance per license | 10¹² | Configurable via `ALLOWANCE_INCREMENT` |
 
 ### Allowance Calculation
 
@@ -597,26 +598,27 @@ This is a proof-of-concept implementation. The paper discusses broader limitatio
 
 ## Configuration Parameters
 
-```ocaml
-module Config = struct
-  let nonce_width = 256
-  let signature_width = 256
-  let allowance_width = 64
-  let init_delay_cycles = 100
-  let allowance_increment = 1_000_000_000_000  (* ~17 min at 1GHz *)
-end
+```systemverilog
+// arith_pkg.sv
+parameter int unsigned WIDTH = 256;      // nonce, signature, and field element width
+
+// security_block.sv
+localparam int unsigned ALLOW_W            = 64;                    // allowance counter width
+localparam int          INIT_DELAY_CYCLES  = 100;                   // cycles before first nonce
+localparam logic [ALLOW_W-1:0] ALLOWANCE_INCREMENT = 64'd1_000_000_000_000;  // ~17 min at 1 GHz
 ```
 
 | Parameter | Value | Description |
 |-----------|-------|-------------|
-| `nonce_width` | 256 | Width of nonce in bits (matches ECDSA message size) |
-| `signature_width` | 256 | Width of signature components r and s |
-| `allowance_width` | 64 | Width of allowance counter (supports ~584 years at 1 GHz) |
-| `init_delay_cycles` | 100 | Cycles to wait after reset before requesting first nonce |
-| `allowance_increment` | 10¹² | Cycles added to allowance per valid license (~17 min at 1 GHz) |
+| `WIDTH` | 256 | Width of nonce, signature components, and field elements |
+| `ALLOW_W` | 64 | Width of allowance counter (supports ~584 years at 1 GHz) |
+| `INIT_DELAY_CYCLES` | 100 | Cycles to wait after reset before requesting first nonce |
+| `ALLOWANCE_INCREMENT` | 10¹² | Cycles added to allowance per valid license (~17 min at 1 GHz) |
 
 ---
 
 ## References
 
 Petrie, J. (2025). Embedded Off-Switches for AI Compute. *arXiv preprint* arXiv:2509.07637. https://arxiv.org/abs/2509.07637
+
+[1]: https://www.secg.org/sec2-v2.pdf
diff --git a/verilog/Makefile b/verilog/Makefile
new file mode 100644
index 0000000..f651765
--- /dev/null
+++ b/verilog/Makefile
@@ -0,0 +1,91 @@
+# ---------------------------------------------------------------------------
+# User configuration — override on the command line, e.g. `make sim TB=ecdsa GUI=1`
+# ---------------------------------------------------------------------------
+
+TOOL ?= verilator
+TB   ?= top
+GUI  ?= 0
+
+# ---------------------------------------------------------------------------
+# Fixed paths and names (TB_FILE is derived from TB)
+# ---------------------------------------------------------------------------
+
+RTL_VC     := rtl/design.vc
+TB_DIR     := tb
+TB_FILE    := $(TB_DIR)/tb_$(TB).sv
+BUILD_DIR  := build
+TOP_MODULE := tb
+
+# ---------------------------------------------------------------------------
+# Simulation
+# ---------------------------------------------------------------------------
+
+VFLAGS := -cc --exe --build -j --timing
+ifeq ($(GUI),1)
+    # using FST instead of VCD as the ECDSA simulation dump is in GBs range
+    # FST is 5-20x smaller in general (space-indented: not a recipe line)
+    VFLAGS += --trace-fst
+endif
+
+.PHONY: sim
+sim:
+ifeq ($(TOOL),verilator)
+	@echo "-- VERILATE & BUILD --------"
+	verilator $(VFLAGS) -F $(RTL_VC) +incdir+$(TB_DIR) $(TB_FILE) --top-module $(TOP_MODULE) $(TB_DIR)/sim_main.cpp --Mdir $(BUILD_DIR)
+	@echo "-- RUN ---------------------"
+	$(BUILD_DIR)/V$(TOP_MODULE)
+	@echo "-- DONE --------------------"
+ifeq ($(GUI),1)
+	@echo "-- OPENING GUI -------------"
+	gtkwave dump.fst
+endif
+else
+	$(error Unknown sim tool '$(TOOL)'. See 'make help' for valid options)
+endif
+
+# ---------------------------------------------------------------------------
+# Lint
+# ---------------------------------------------------------------------------
+# Verible: accept ALL_CAPS or CamelCase names for localparams
+VERIBLE_RULES_FLAG := --rules="parameter-name-style=localparam_style_regex:([A-Z][A-Z0-9]*(_[A-Z0-9]+)*|([A-Z][a-z0-9]*)+(_[0-9]+)?)"
+
+.PHONY: lint
+lint:
+ifeq ($(GUI),1)
+	$(error Lint GUI '$(GUI)' not supported. See 'make help' for valid options)
+endif
+ifeq ($(TOOL),verilator)
+	@echo "-- RUN VERILATOR LINT -------"
+	verilator --lint-only -Wall -F $(RTL_VC)
+	@echo "-- DONE --------------------"
+else ifeq ($(TOOL),verible)
+	@echo "-- RUN VERIBLE LINT --------"
+	cd $(dir $(RTL_VC)) && \
+	verible-verilog-lint $(VERIBLE_RULES_FLAG) \
+	    $(shell cat $(RTL_VC))
+	@echo "-- DONE --------------------"
+else
+	$(error Unknown lint tool '$(TOOL)'. See 'make help' for valid options)
+endif
+
+
+
+# ---------------------------------------------------------------------------
+# Housekeeping
+# ---------------------------------------------------------------------------
+
+.PHONY: clean
+clean:
+	rm -rf $(BUILD_DIR) *.vcd *.fst *.vpd *.wlf *.log simv csrc
+
+.PHONY: help
+help:
+	@echo ""
+	@echo "RTL Makefile — available targets"
+	@echo "────────────────────────────────────────────────────────"
+	@echo "  sim      Run simulation (TOOL=verilator TB=top|ecdsa|arith GUI=0|1)"
+	@echo "  lint     Run linter (TOOL=verilator|verible GUI=0)"
+	@echo "  clean    Remove all generated build artefacts"
+	@echo "  help     Print this message"
+
+.DEFAULT_GOAL := help
diff --git a/verilog/rtl/arith.sv b/verilog/rtl/arith.sv
new file mode 100644
index 0000000..c6a8e6c
--- /dev/null
+++ b/verilog/rtl/arith.sv
@@ -0,0 +1,199 @@
+// Arith - Modular arithmetic unit for secp256k1 field operations
+//
+// Performs add, sub, mul, inv modulo `modulus` (field prime p or curve order n).
+// Operands are read from and results written to an external register file.
+//
+// Operations (op input, arith_pkg::op_e):
+//   0 = add: f <- a + b  mod m
+//   1 = sub: f <- a - b  mod m
+//   2 = mul: f <- a * b  mod m
+//   3 = inv: f <- a^(-1) mod m  (b ignored)
+//
+// Protocol:
+//   1. Drive a, b, modulus, op; assert valid and hold all of them until ready
+//   2. ready pulses high for one cycle when the result is available
+
+module arith
+    import arith_pkg::*; // import in module header to be used in port list
+(
+    input  logic             clk,
+    input  logic             rst_n,
+    input  logic             valid,    // request; hold high until ready
+    input  op_e              op,       // operation select; also steers the muxes below
+    input  logic [WIDTH-1:0] a,        // first operand
+    input  logic [WIDTH-1:0] b,        // second operand (not connected to mod_inv)
+    input  logic [WIDTH-1:0] modulus,  // modulus m; wired to mod_add and mod_inv
+
+    output logic             ready,    // completion flag of the unit selected by op
+    output logic [WIDTH-1:0] result    // result of the unit selected by op
+);
+
+    // ---------------------------------------------------------------------------
+    // Shared mod_add — single instance used by add/sub directly and by mul/inv
+    // ---------------------------------------------------------------------------
+
+    // Adder inputs are driven by the op-selected mux near the end of this module
+    logic             mod_add_valid;
+    logic [WIDTH-1:0] mod_add_a;
+    logic [WIDTH-1:0] mod_add_b;
+    logic             mod_add_subtract;
+
+    logic             mod_add_ready;
+    logic [WIDTH-1:0] mod_add_result;
+    logic             mod_add_adjust;   // forwarded to mod_inv only
+
+    mod_add u_mod_add (
+        .clk      (clk),
+        .rst_n    (rst_n),
+        .valid    (mod_add_valid),
+        .a        (mod_add_a),
+        .b        (mod_add_b),
+        .modulus  (modulus),
+        .subtract (mod_add_subtract),
+        .ready    (mod_add_ready),
+        .result   (mod_add_result),
+        .adjust   (mod_add_adjust)
+    );
+
+    // ---------------------------------------------------------------------------
+    // mod_mul instance
+    // ---------------------------------------------------------------------------
+
+    // Input glue logic: mod_mul only sees the request when op selects multiply
+    wire mod_mul_valid = valid && (op == OP_MUL);
+
+    // Output nets
+    logic             mod_mul_ready;
+    logic [WIDTH-1:0] mod_mul_result;
+
+    // mod_add interface (requests issued by mod_mul towards the shared adder)
+    logic             mod_mul_add_valid;
+    logic [WIDTH-1:0] mod_mul_add_a;
+    logic [WIDTH-1:0] mod_mul_add_b;
+    logic             mod_mul_add_subtract;
+
+    // mod_add resp glue logic: adder ready is gated with mod_mul's own request
+    wire              mod_mul_add_ready  = mod_mul_add_valid && mod_add_ready;
+    wire [WIDTH-1:0]  mod_mul_add_result = mod_add_result;
+
+    mod_mul u_mod_mul (
+        .clk             (clk),
+        .rst_n           (rst_n),
+        .valid           (mod_mul_valid),
+        .a               (a),
+        .b               (b),
+        .mod_add_ready   (mod_mul_add_ready),
+        .mod_add_result  (mod_mul_add_result),
+        .ready           (mod_mul_ready),
+        .result          (mod_mul_result),
+        .mod_add_valid   (mod_mul_add_valid),
+        .mod_add_a       (mod_mul_add_a),
+        .mod_add_b       (mod_mul_add_b),
+        .mod_add_subtract(mod_mul_add_subtract)
+    );
+
+    // ---------------------------------------------------------------------------
+    // mod_inv instance
+    // ---------------------------------------------------------------------------
+
+    // Input glue logic: mod_inv only sees the request when op selects inverse
+    wire mod_inv_valid = valid && (op == OP_INV);
+
+    // Output nets
+    logic             mod_inv_ready;
+    logic             mod_inv_exists_unused;    // not used currently at arith level
+    logic [WIDTH-1:0] mod_inv_result;
+
+    // mod_add interface (requests issued by mod_inv towards the shared adder)
+    logic             mod_inv_add_valid;
+    logic [WIDTH-1:0] mod_inv_add_a;
+    logic [WIDTH-1:0] mod_inv_add_b;
+    logic             mod_inv_add_subtract;
+
+    // mod_add resp glue logic: adder ready is gated with mod_inv's own request
+    wire              mod_inv_add_ready  = mod_inv_add_valid && mod_add_ready;
+    wire [WIDTH-1:0]  mod_inv_add_result = mod_add_result;
+    wire              mod_inv_add_adjust = mod_add_adjust;
+
+    mod_inv u_mod_inv (
+        .clk             (clk),
+        .rst_n           (rst_n),
+        .valid           (mod_inv_valid),
+        .a               (a),
+        .modulus         (modulus),
+        .mod_add_ready   (mod_inv_add_ready),
+        .mod_add_result  (mod_inv_add_result),
+        .mod_add_adjust  (mod_inv_add_adjust),
+        .ready           (mod_inv_ready),
+        .exists          (mod_inv_exists_unused),
+        .result          (mod_inv_result),
+        .mod_add_valid   (mod_inv_add_valid),
+        .mod_add_a       (mod_inv_add_a),
+        .mod_add_b       (mod_inv_add_b),
+        .mod_add_subtract(mod_inv_add_subtract)
+    );
+
+    // ---------------------------------------------------------------------------
+    // mod_add input assignments — op selects which client drives the shared adder
+    // ---------------------------------------------------------------------------
+
+    always_comb begin
+        mod_add_valid    = 1'b0;   // defaults; each op_e branch below overrides them
+        mod_add_a        = '0;
+        mod_add_b        = '0;
+        mod_add_subtract = 1'b0;
+
+        unique case(op)
+            OP_ADD: begin          // top-level request goes straight to the adder
+                mod_add_valid    = valid;
+                mod_add_a        = a;
+                mod_add_b        = b;
+                mod_add_subtract = 1'b0;
+            end
+            OP_SUB: begin          // same path as OP_ADD with subtract asserted
+                mod_add_valid    = valid;
+                mod_add_a        = a;
+                mod_add_b        = b;
+                mod_add_subtract = 1'b1;
+            end
+            OP_MUL: begin          // adder is driven by mod_mul's request port
+                mod_add_valid    = mod_mul_add_valid;
+                mod_add_a        = mod_mul_add_a;
+                mod_add_b        = mod_mul_add_b;
+                mod_add_subtract = mod_mul_add_subtract;
+            end
+            OP_INV: begin          // adder is driven by mod_inv's request port
+                mod_add_valid    = mod_inv_add_valid;
+                mod_add_a        = mod_inv_add_a;
+                mod_add_b        = mod_inv_add_b;
+                mod_add_subtract = mod_inv_add_subtract;
+            end
+        endcase
+    end
+
+    // ---------------------------------------------------------------------------
+    // Output assignments — ready/result are taken from the unit selected by op
+    // ---------------------------------------------------------------------------
+
+    always_comb begin
+        ready  = 1'b0;
+        result = '0;
+
+        unique case(op)
+            OP_ADD,
+            OP_SUB: begin          // adder output is the final result
+                ready  = mod_add_ready;
+                result = mod_add_result;
+            end
+            OP_MUL: begin
+                ready  = mod_mul_ready;
+                result = mod_mul_result;
+            end
+            OP_INV: begin
+                ready  = mod_inv_ready;
+                result = mod_inv_result;
+            end
+        endcase
+    end
+
+endmodule
diff --git a/verilog/rtl/arith_pkg.sv b/verilog/rtl/arith_pkg.sv
new file mode 100644
index 0000000..14de35c
--- /dev/null
+++ b/verilog/rtl/arith_pkg.sv
@@ -0,0 +1,12 @@
+package arith_pkg;
+    // Shared operand width and opcode type for the modular arithmetic units.
+    parameter int unsigned WIDTH = 256; // operand/result width in bits
+    // Opcodes for arith's `op` port; m is the value on arith's `modulus` input.
+    typedef enum logic [1:0] {
+        OP_ADD = 2'd0, // modular addition:        a + b mod m
+        OP_SUB = 2'd1, // modular subtraction:     a - b mod m
+        OP_MUL = 2'd2, // modular multiplication:  a * b mod m
+        OP_INV = 2'd3  // modular inverse:         a^-1  mod m (b ignored)
+    } op_e;
+
+endpackage
diff --git a/verilog/rtl/comb_add.sv b/verilog/rtl/comb_add.sv
new file mode 100644
index 0000000..8e4823d
--- /dev/null
+++ b/verilog/rtl/comb_add.sv
@@ -0,0 +1,26 @@
+module comb_add
+    import arith_pkg::*; // package import precedes the port list so WIDTH is visible there
+(
+    input  wire [WIDTH-1:0] a,
+    input  wire [WIDTH-1:0] b,
+    input  wire             subtract,
+    output wire [WIDTH-1:0] result,
+    output wire             carry_out
+);
+
+    // Purely combinational add/subtract built around one (WIDTH+1)-bit adder.
+    //   subtract = 0 : {carry_out, result} = a + b
+    //   subtract = 1 : {carry_out, result} = a + ~{1'b0, b} + 1  (two's complement a - b)
+    // Both operands are zero-extended by one bit so that the adder's top bit
+    // can be exported as carry_out.
+    wire [WIDTH:0] a_wide = {1'b0, a};
+    wire [WIDTH:0] b_wide = {1'b0, b};
+
+    // XOR with an all-`subtract` mask inverts every bit of b_wide when
+    // subtracting and passes it through unchanged when adding.
+    wire [WIDTH:0] addend   = b_wide ^ {(WIDTH+1){subtract}};
+    wire [WIDTH:0] carry_in = { {WIDTH{1'b0}}, subtract};
+
+    // carry_in supplies the "+1" that completes the two's complement negation.
+    assign {carry_out, result} = a_wide + addend + carry_in;
+endmodule
diff --git a/verilog/rtl/design.vc b/verilog/rtl/design.vc
new file mode 100644
index 0000000..315299d
--- /dev/null
+++ b/verilog/rtl/design.vc
@@ -0,0 +1,10 @@
+./arith_pkg.sv
+./comb_add.sv
+./mod_add.sv
+./mod_mul.sv
+./mod_inv.sv
+./arith.sv
+./trng.sv
+./secp256k1_pkg.sv
+./ecdsa.sv
+./security_block.sv
diff --git a/verilog/rtl/ecdsa.sv b/verilog/rtl/ecdsa.sv
new file mode 100644
index 0000000..74098b3
--- /dev/null
+++ b/verilog/rtl/ecdsa.sv
@@ -0,0 +1,509 @@
+// ECDSA - Signature verification for secp256k1
+//
+// Verifies ECDSA signatures using:
+//   R = u1*G + u2*Q
+// where:
+//   u1 = z * s^(-1) mod n
+//   u2 = r * s^(-1) mod n
+//
+// Signature is valid if R.x mod n == r
+//
+// Uses Renes-Costello-Batina complete addition formula in projective coordinates.
+// Uses Shamir's trick for simultaneous scalar multiplication (processes u1/u2
+// bits in parallel, selecting G/Q/G+Q/infinity per iteration).
+//
+// Hardcoded: G (generator), Q = 2G (public key), G+Q = 3G (precomputed sum)
+//
+// Protocol:
+//   1. Assert valid and hold z, r, s stable until ready pulses
+//   2. ready pulses high for one cycle when verification completes
+//   3. When ready, check verif_passed: 1 = signature verification passed, 0 = signature verification failed
+//
+// FSM:
+//
+//   StIdle -> StPrepare -> StAdd -> StDouble -> StAdd -> StFinalize -> StIdle
+//                                      ^          |
+//                                      |__________|
+//
+// Note: could do the same skip StAdd optimization as in mod_mul but
+// PC loading does not currently support re-running the same state (StDouble after StDouble)
+
+
+module ecdsa
+    import arith_pkg::*; // import in module header to be used in port list
+    import secp256k1_pkg::*;
+(
+    input  logic             clk,
+    input  logic             rst_n,
+    input  logic             valid,
+    input  logic [WIDTH-1:0] z,
+    input  logic [WIDTH-1:0] r,
+    input  logic [WIDTH-1:0] s,
+
+    output logic             ready,
+    output logic             verif_passed
+);
+
+    // -------------------------------------------------------------------------
+    // Types and Constants
+    // -------------------------------------------------------------------------
+
+    typedef logic [4:0] all_addr_t;
+
+    // Register file indices
+    typedef enum all_addr_t {
+        T0, T1, T2, T3, T4, T5,
+        X3, Y3, Z3,
+        X1, Y1, Z1,
+        X2, Y2, Z2,
+        A1, B3,   // constants, not actual registers
+        NUM_ADDRS // last element to contain the total number of addresses
+    } all_addr_e;
+
+    localparam int NUM_CONSTS = 2;
+    localparam int NUM_REGS   = int'(NUM_ADDRS) - NUM_CONSTS; // number of actual registers
+
+    typedef logic [$clog2(NUM_REGS)-1:0] reg_addr_t;
+
+    localparam int BITCNT_W = $clog2(WIDTH); // Bit Counter Width
+
+    // Public key Q (derived from G and Private key d)
+    localparam logic [WIDTH-1:0]
+    Q_X = 256'hc6047f9441ed7d6d3045406e95c07cd85c778e4b8cef3ca7abac09b95c709ee5,
+    Q_Y = 256'h1ae168fea63dc339a3c58419466ceaeef7f632653266d0e1236431a950cfe52a,
+    Q_Z = 1;
+
+    // Precomputed G + Q (assumed to be computed together with Q, so not implementing the addition here)
+    localparam logic [WIDTH-1:0]
+    GPQ_X = 256'hf9308a019258c31049344f85f89d5229b531c845836f99b08601f113bce036f9,
+    GPQ_Y = 256'h388f7b0f632de8140fe337e62a37f3566500a99934c2231b6cb9fd7584b8e672,
+    GPQ_Z = 1;
+
+    // Point at infinity (z = 0)
+    localparam logic [WIDTH-1:0]
+    INF_X = 0,
+    INF_Y = 1,
+    INF_Z = 0;
+
+    // -------------------------------------------------------------------------
+    // Instruction ROM
+    // -------------------------------------------------------------------------
+
+    typedef struct packed {
+        op_e        op;
+        all_addr_t  src1;
+        all_addr_t  src2;
+        all_addr_t  dst;
+        // Note: dst can only be register (not constant) so reg_addr_t could also work,
+        //       but the reg enums are using all_addr_t, so using that avoids casting
+    } instr_t;
+
+    // Segment lengths and PC width
+    // Note: tried assigning the programs to separate arrays to query their lengths,
+    //       but Verilator had issues with concatenating those
+    localparam int PREPARE_LEN   = 3;
+    localparam int POINT_ADD_LEN = 40;
+    localparam int FINALIZE_LEN  = 3;
+    localparam int ROM_SIZE      = PREPARE_LEN + POINT_ADD_LEN + FINALIZE_LEN;
+    localparam int PC_WIDTH      = $clog2(ROM_SIZE);
+
+    typedef logic [PC_WIDTH-1:0] pc_t;
+
+    localparam instr_t PROGRAM [ROM_SIZE] = '{
+
+        // --- Prepare (mod n) ---
+        // w = s^(-1) mod n;  u1 = z*w mod n;  u2 = r*w mod n
+        // Assumes t0=s, t1=z, t2=r
+        /* 1 */ '{op: OP_INV, src1: T0, src2: T0, dst: T0},   // t0 = inv(t0)
+        /* 2 */ '{op: OP_MUL, src1: T1, src2: T0, dst: T1},   // t1 = t1 * t0
+        /* 3 */ '{op: OP_MUL, src1: T2, src2: T0, dst: T2},   // t2 = t2 * t0
+
+        // --- Point addition (Renes-Costello-Batina, 40 steps) ---
+        /*  1 */ '{op: OP_MUL, src1: X1, src2: X2, dst: T0},   // t0 = x1*x2
+        /*  2 */ '{op: OP_MUL, src1: Y1, src2: Y2, dst: T1},   // t1 = y1*y2
+        /*  3 */ '{op: OP_MUL, src1: Z1, src2: Z2, dst: T2},   // t2 = z1*z2
+        /*  4 */ '{op: OP_ADD, src1: X1, src2: Y1, dst: T3},   // t3 = x1+y1
+        /*  5 */ '{op: OP_ADD, src1: X2, src2: Y2, dst: T4},   // t4 = x2+y2
+        /*  6 */ '{op: OP_MUL, src1: T3, src2: T4, dst: T3},   // t3 = t3*t4
+        /*  7 */ '{op: OP_ADD, src1: T0, src2: T1, dst: T4},   // t4 = t0+t1
+        /*  8 */ '{op: OP_SUB, src1: T3, src2: T4, dst: T3},   // t3 = t3-t4
+        /*  9 */ '{op: OP_ADD, src1: X1, src2: Z1, dst: T4},   // t4 = x1+z1
+        /* 10 */ '{op: OP_ADD, src1: X2, src2: Z2, dst: T5},   // t5 = x2+z2
+        /* 11 */ '{op: OP_MUL, src1: T4, src2: T5, dst: T4},   // t4 = t4*t5
+        /* 12 */ '{op: OP_ADD, src1: T0, src2: T2, dst: T5},   // t5 = t0+t2
+        /* 13 */ '{op: OP_SUB, src1: T4, src2: T5, dst: T4},   // t4 = t4-t5
+        /* 14 */ '{op: OP_ADD, src1: Y1, src2: Z1, dst: T5},   // t5 = y1+z1
+        /* 15 */ '{op: OP_ADD, src1: Y2, src2: Z2, dst: X3},   // x3 = y2+z2
+        /* 16 */ '{op: OP_MUL, src1: T5, src2: X3, dst: T5},   // t5 = t5*x3
+        /* 17 */ '{op: OP_ADD, src1: T1, src2: T2, dst: X3},   // x3 = t1+t2
+        /* 18 */ '{op: OP_SUB, src1: T5, src2: X3, dst: T5},   // t5 = t5-x3
+        /* 19 */ '{op: OP_MUL, src1: A1, src2: T4, dst: Z3},   // z3 = a1*t4
+        /* 20 */ '{op: OP_MUL, src1: B3, src2: T2, dst: X3},   // x3 = b3*t2
+        /* 21 */ '{op: OP_ADD, src1: X3, src2: Z3, dst: Z3},   // z3 = x3+z3
+        /* 22 */ '{op: OP_SUB, src1: T1, src2: Z3, dst: X3},   // x3 = t1-z3
+        /* 23 */ '{op: OP_ADD, src1: T1, src2: Z3, dst: Z3},   // z3 = t1+z3
+        /* 24 */ '{op: OP_MUL, src1: X3, src2: Z3, dst: Y3},   // y3 = x3*z3
+        /* 25 */ '{op: OP_ADD, src1: T0, src2: T0, dst: T1},   // t1 = t0+t0
+        /* 26 */ '{op: OP_ADD, src1: T1, src2: T0, dst: T1},   // t1 = t1+t0
+        /* 27 */ '{op: OP_MUL, src1: A1, src2: T2, dst: T2},   // t2 = a1*t2
+        /* 28 */ '{op: OP_MUL, src1: B3, src2: T4, dst: T4},   // t4 = b3*t4
+        /* 29 */ '{op: OP_ADD, src1: T1, src2: T2, dst: T1},   // t1 = t1+t2
+        /* 30 */ '{op: OP_SUB, src1: T0, src2: T2, dst: T2},   // t2 = t0-t2
+        /* 31 */ '{op: OP_MUL, src1: A1, src2: T2, dst: T2},   // t2 = a1*t2
+        /* 32 */ '{op: OP_ADD, src1: T4, src2: T2, dst: T4},   // t4 = t4+t2
+        /* 33 */ '{op: OP_MUL, src1: T1, src2: T4, dst: T0},   // t0 = t1*t4
+        /* 34 */ '{op: OP_ADD, src1: Y3, src2: T0, dst: Y1},   // y1 = y3+t0
+        /* 35 */ '{op: OP_MUL, src1: T5, src2: T4, dst: T0},   // t0 = t5*t4
+        /* 36 */ '{op: OP_MUL, src1: T3, src2: X3, dst: X3},   // x3 = t3*x3
+        /* 37 */ '{op: OP_SUB, src1: X3, src2: T0, dst: X1},   // x1 = x3-t0
+        /* 38 */ '{op: OP_MUL, src1: T3, src2: T1, dst: T0},   // t0 = t3*t1
+        /* 39 */ '{op: OP_MUL, src1: T5, src2: Z3, dst: Z3},   // z3 = t5*z3
+        /* 40 */ '{op: OP_ADD, src1: Z3, src2: T0, dst: Z1},   // z1 = z3+t0
+
+        // --- Finalize (mod p) ---
+        // z_inv = z1^(-1) mod p;  x_affine = x1*z_inv;  result = x_affine - r
+        // Assumes t2=r (restored from r input before entering finalize)
+        /* 1 */ '{op: OP_INV, src1: Z1, src2: Z1, dst: T0},   // t0 = inv(z1)
+        /* 2 */ '{op: OP_MUL, src1: X1, src2: T0, dst: T0},   // t0 = x1*t0
+        /* 3 */ '{op: OP_SUB, src1: T0, src2: T2, dst: T0}    // t0 = t0-t2
+    };
+
+    // Segment boundaries
+    localparam int ROM_START       = 0;
+    localparam int PREPARE_START   = ROM_START;
+    localparam int PREPARE_END     = PREPARE_START   + PREPARE_LEN   - 1;
+    localparam int POINT_ADD_START = PREPARE_END     + 1;
+    localparam int POINT_ADD_END   = POINT_ADD_START + POINT_ADD_LEN - 1;
+    localparam int FINALIZE_START  = POINT_ADD_END   + 1;
+    localparam int FINALIZE_END    = FINALIZE_START  + FINALIZE_LEN  - 1;
+
+    // Array to collect the PC values where execution should automatically stop
+    localparam int PROGRAM_ENDS [3] = '{PREPARE_END, POINT_ADD_END, FINALIZE_END};
+
+    // -------------------------------------------------------------------------
+    // FSM states
+    // -------------------------------------------------------------------------
+
+    typedef enum logic [2:0] {
+        StIdle,
+        StPrepare,
+        StAdd,
+        StDouble,
+        StFinalize
+    } state_e;
+
+    // -------------------------------------------------------------------------
+    // Registers
+    // -------------------------------------------------------------------------
+
+    // FSM state
+    state_e state_q, state_d;
+
+    // Register file
+    logic [WIDTH-1:0] reg_file_q [NUM_REGS];
+    logic [WIDTH-1:0] reg_file_d [NUM_REGS];
+
+    // Other registers
+    pc_t                  pc_q,       pc_d;
+    logic [WIDTH-1:0]     u1_q,       u1_d;
+    logic [WIDTH-1:0]     u2_q,       u2_d;
+    logic [BITCNT_W-1:0]  bit_pos_q,  bit_pos_d;
+
+    // -------------------------------------------------------------------------
+    // Instruction decode
+    // -------------------------------------------------------------------------
+
+    instr_t current_instr;
+    assign current_instr = PROGRAM[pc_q];
+
+    // only Prepare requires PRIME_N
+    wire [WIDTH-1:0] modulus = (int'(pc_q) <= PREPARE_END) ? PRIME_N : PRIME_P;
+
+    // -------------------------------------------------------------------------
+    // Register file access helpers
+    // -------------------------------------------------------------------------
+
+    function automatic logic [WIDTH-1:0] reg_read(input all_addr_t addr);
+        // Operand fetch. A1 / B3 select curve constants that have no storage in the register file.
+        if (addr == A1) return CURVE_A1;
+        if (addr == B3) return CURVE_B3;
+
+        // Every other address is a real register; the cast drops the upper address
+        // bit(s) that the smaller register-file index does not need.
+        return reg_file_q[reg_addr_t'(addr)];
+    endfunction
+
+    function automatic void reg_write(input all_addr_t addr, input logic [WIDTH-1:0] val);
+        // Stores val in the next-state register-file entry selected by addr. The condition below holds for
+        // either known value of the address MSB; it is written so lint accepts that the cast discards that bit.
+        if ( addr[$size(all_addr_t)-1] ||
+            !addr[$size(all_addr_t)-1]) begin
+            // The constant addresses (A1, B3) must not be passed here: once narrowed by the cast they no
+            // longer denote a constant (e.g. B3 = 16 narrows to index 0, which is T0).
+            reg_file_d[reg_addr_t'(addr)] = val;
+        end
+    endfunction
+
+    // -------------------------------------------------------------------------
+    // Arith instance
+    // -------------------------------------------------------------------------
+
+    // arith block enable register
+    logic arith_valid_q, arith_valid_d;
+
+    // Outputs, used in FSM always_comb
+    logic             arith_ready;   // used to increment the PC and sample arith_result
+    logic [WIDTH-1:0] arith_result;  // stored in current_instr.dst register
+
+    arith u_arith (
+        .clk       (clk),
+        .rst_n     (rst_n),
+        .valid     (arith_valid_q),
+        .op        (current_instr.op),
+        .a         (reg_read(current_instr.src1)),
+        .b         (reg_read(current_instr.src2)),
+        .modulus   (modulus),
+        .ready     (arith_ready),
+        .result    (arith_result)
+    );
+
+    // -------------------------------------------------------------------------
+    // Shamir's trick point selection
+    // -------------------------------------------------------------------------
+
+    logic [WIDTH-1:0] sel_x, sel_y, sel_z;
+
+    always_comb begin
+        // REVISIT - shift register approach to access u1 and u2 bits could be much less gates
+        unique case ({u2_q[bit_pos_q], u1_q[bit_pos_q]}) // selector = {u2 bit, u1 bit} at the current bit position
+            2'b00:   begin sel_x = INF_X; sel_y = INF_Y; sel_z = INF_Z; end // neither bit set: point at infinity
+            2'b01:   begin sel_x = G_X;   sel_y = G_Y;   sel_z = G_Z;   end // only the u1 bit set: G
+            2'b10:   begin sel_x = Q_X;   sel_y = Q_Y;   sel_z = Q_Z;   end // only the u2 bit set: Q
+            2'b11:   begin sel_x = GPQ_X; sel_y = GPQ_Y; sel_z = GPQ_Z; end // both bits set: precomputed G + Q
+            default: ; // reached only when the selector is not a known 2-bit value; sel_* are not assigned
+        endcase
+    end
+
+    // -------------------------------------------------------------------------
+    // PC — combinational next-state
+    // -------------------------------------------------------------------------
+    always_comb begin
+        // Default: the PC keeps its value
+        pc_d = pc_q;
+
+        if (arith_ready) begin
+            // An instruction just completed: step to the next ROM entry
+            pc_d = pc_q + 1;
+        end else if (state_d != state_q) begin
+            // FSM is about to change state (should not coincide with arith_ready):
+            // point the PC at the first instruction of the program the next state runs
+            case (state_d)
+                StPrepare:       pc_d = pc_t'(PREPARE_START);
+                StAdd, StDouble: pc_d = pc_t'(POINT_ADD_START);
+                StFinalize:      pc_d = pc_t'(FINALIZE_START);
+                default: ; // remaining states run no program, so nothing is loaded
+            endcase
+        end
+    end
+
+    // -------------------------------------------------------------------------
+    // FSM — combinational next-state and data path
+    // -------------------------------------------------------------------------
+
+    always_comb begin
+        // Outputs (inactive by default; only driven in the cycle StFinalize reports the verdict)
+        ready        = 1'b0;
+        verif_passed = 1'b0;
+
+        // Simple registers (hold by default)
+        state_d        = state_q;
+        u1_d           = u1_q;
+        u2_d           = u2_q;
+        bit_pos_d      = bit_pos_q;
+        arith_valid_d  = arith_valid_q;
+
+        foreach (reg_file_d[i]) begin
+            reg_file_d[i] = reg_file_q[i];
+        end
+
+        // Handle running the program here centrally for all states
+        if (arith_ready) begin
+
+            // When arith block ready, store result
+            reg_write(current_instr.dst, arith_result);
+
+            // If end of program reached, stop the program
+            if (int'(pc_q) inside {PROGRAM_ENDS}) begin
+                arith_valid_d = 1'b0;
+            end
+        end
+
+        // State machine
+        unique case (state_q)
+            // -----------------------------------------------------------------
+            StIdle: begin
+                if (valid) begin
+                    // Initialize P1 accumulator to point infinity
+                    reg_write(X1, INF_X);
+                    reg_write(Y1, INF_Y);
+                    reg_write(Z1, INF_Z);
+
+                    // Move to next state
+                    state_d = StPrepare;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StPrepare: begin
+
+                // PC loading handled in separate always_comb
+
+                // If program not started yet, load the inputs and start the program
+                if (!arith_valid_q && int'(pc_q) == PREPARE_START) begin
+
+                    reg_write(T0, s);
+                    reg_write(T1, z);
+                    reg_write(T2, r);
+
+                    arith_valid_d = 1'b1;
+                end
+
+                // Nothing to do here when the program running, it's handled outside the case statement
+
+                // When program finished, store the u1, u2 results, initialize loop counter and move to next state
+                if (!arith_valid_q && int'(pc_q) != PREPARE_START) begin
+                    u1_d      = reg_read(T1);
+                    u2_d      = reg_read(T2);
+                    bit_pos_d = BITCNT_W'(WIDTH-1);
+
+                    state_d = StAdd;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StAdd: begin
+
+                // PC loading handled in separate always_comb
+
+                // If program not started yet, load the inputs and start the program
+                if (!arith_valid_q && int'(pc_q) == POINT_ADD_START) begin
+                    // P2 = selected_point (for P1 += P2)
+                    reg_write(X2, sel_x);
+                    reg_write(Y2, sel_y);
+                    reg_write(Z2, sel_z);
+
+                    arith_valid_d = 1'b1;
+                end
+
+                // Nothing to do here when the program running, it's handled outside the case statement
+
+                // When program finished, move to next state (results already in the P1 accumulator)
+                if (!arith_valid_q && int'(pc_q) != POINT_ADD_START) begin
+                    // Stop condition: last bit (doubling not needed then)
+                    state_d = (bit_pos_q != '0) ? StDouble : StFinalize;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StDouble: begin
+
+                // PC loading handled in separate always_comb
+
+                // If program not started yet, load the inputs and start the program
+                if (!arith_valid_q && int'(pc_q) == POINT_ADD_START) begin
+                    // P2 = P1 (for P1 + P2 = 2*P1)
+                    reg_write(X2, reg_read(X1));
+                    reg_write(Y2, reg_read(Y1));
+                    reg_write(Z2, reg_read(Z1));
+
+                    arith_valid_d = 1'b1;
+                end
+
+                // Nothing to do here when the program running, it's handled outside the case statement
+
+                // When program finished, move back to add state and decrement bit counter (results already in the P1 accumulator)
+                if (!arith_valid_q && int'(pc_q) != POINT_ADD_START) begin
+                    bit_pos_d = bit_pos_q - 1;
+                    // Note: could do the same skip StAdd optimization as in mod_mul but
+                    // PC loading does not currently support re-running the same state (StDouble after StDouble)
+                    state_d = StAdd;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StFinalize: begin
+                // PC loading handled in separate always_comb
+
+                // If program not started yet, load the inputs and start the program
+                if (!arith_valid_q && int'(pc_q) == FINALIZE_START) begin
+                    // X1, Y1, Z1 are already in the corresponding registers
+                    reg_write(T2, r);
+                    arith_valid_d = 1'b1;
+                end
+
+                // Nothing to do while the program runs (handled outside the case statement).
+                // When it has finished, report the verdict and return to idle. Besides x_affine == r
+                // (T0 == 0), r and s must lie in [1, n-1] and R must not be the point at infinity
+                // (Z1 == 0); otherwise degenerate inputs such as r = s = 0 could be accepted.
+                if (!arith_valid_q && int'(pc_q) != FINALIZE_START) begin
+                    ready        = 1'b1;
+                    verif_passed = (reg_read(T0) == '0) && (reg_read(Z1) != '0) &&
+                                   (r != '0) && (r < PRIME_N) &&
+                                   (s != '0) && (s < PRIME_N);
+                    state_d = StIdle;
+                end
+            end
+
+            default: ;
+        endcase
+    end
+
+    // -------------------------------------------------------------------------
+    // Sequential: register updates, asynchronous active-low reset
+    // -------------------------------------------------------------------------
+
+    // Register file registers
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            // Asynchronous reset: clear every register-file entry
+            foreach (reg_file_q[idx]) begin
+                reg_file_q[idx] <= '0;
+            end
+        end else begin
+            // Normal operation: capture the next-state value of every entry
+            foreach (reg_file_q[idx]) begin
+                reg_file_q[idx] <= reg_file_d[idx];
+            end
+        end
+    end
+
+    // Control registers: FSM state, program counter and arith request flag.
+    // Reset puts the block in StIdle with the PC at 0 and no arith request pending.
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            state_q       <= StIdle;
+            pc_q          <= '0;
+            arith_valid_q <= 1'b0;
+        end else begin
+            state_q       <= state_d;
+            pc_q          <= pc_d;
+            arith_valid_q <= arith_valid_d;
+        end
+    end
+
+    // Scalar-multiplication loop registers: the scalars u1 / u2 and the
+    // position of the scalar bit currently being processed
+    // (u1 / u2 are loaded when StPrepare finishes; bit_pos counts down from WIDTH-1).
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            u1_q      <= '0;
+            u2_q      <= '0;
+            bit_pos_q <= '0;
+        end else begin
+            u1_q      <= u1_d;
+            u2_q      <= u2_d;
+            bit_pos_q <= bit_pos_d;
+        end
+    end
+
+endmodule
diff --git a/verilog/rtl/mod_add.sv b/verilog/rtl/mod_add.sv
new file mode 100644
index 0000000..f0405f7
--- /dev/null
+++ b/verilog/rtl/mod_add.sv
@@ -0,0 +1,158 @@
+// Mod_add - Simple modular addition / subtraction
+//
+// Computes (x ± y) mod modulus, where the subtract signal determines the sense of the operation.
+//
+// State machine: StAdd -> StAdjust -> StAdd ...
+// Processing one modular operation in each StAdd -> StAdjust cycle.
+// If there is no new request, the FSM remains in StAdd until a new request arrives.
+
+
+module mod_add
+    import arith_pkg::*; // import in module header to be used in port list
+(
+    input  logic             clk,
+    input  logic             rst_n,    // asynchronous, active low
+    input  logic             valid,    // request: a, b are sampled in StAdd while valid is high
+    input  logic [WIDTH-1:0] a,
+    input  logic [WIDTH-1:0] b,
+    input  logic [WIDTH-1:0] modulus,  // read in StAdjust, i.e. the cycle after a and b were sampled
+    input  logic             subtract, // 0: a + b, 1: a - b; read in both StAdd and StAdjust
+
+    output logic             ready,    // high in StAdjust when result is available
+    output logic [WIDTH-1:0] result,   // masked to zero while ready is low
+    output logic             adjust    // high (together with ready) when the modulus correction was applied
+);
+
+    // FSM enum
+    typedef enum logic {
+        StAdd,
+        StAdjust
+    } state_e;
+
+    // ---------------------------------------------------------------------------
+    // Registers
+    // ---------------------------------------------------------------------------
+
+    // FSM state
+    state_e           state_q;
+    state_e           state_d;
+
+    // Intermediate result
+    logic [WIDTH-1:0] result_ab_q;
+    logic [WIDTH-1:0] result_ab_d;
+
+    // Intermediate carry
+    logic             carry_ab_q;
+    logic             carry_ab_d;
+
+    // ---------------------------------------------------------------------------
+    // Adder instance
+    // ---------------------------------------------------------------------------
+
+    // Adder inputs
+    logic [WIDTH-1:0] adder_a;
+    logic [WIDTH-1:0] adder_b;
+    logic             adder_subtract;
+
+    // Adder outputs
+    logic [WIDTH-1:0] adder_result;
+    logic             adder_carry_out;
+
+    // Additional adder_ready signal considered in the FSM to support sequential adders as well
+    // adder_ready is always 1 (comb_add has no latency)
+    wire adder_ready = 1'b1;
+
+    comb_add u_comb_add (
+        .a         (adder_a),
+        .b         (adder_b),
+        .subtract  (adder_subtract),
+        .result    (adder_result),
+        .carry_out (adder_carry_out)
+    );
+
+    // ---------------------------------------------------------------------------
+    // FSM
+    // ---------------------------------------------------------------------------
+
+    // FSM: combinational next-state, and output decode (including adder inputs) + data registers controlled by the FSM
+    always_comb begin
+        // Defaults
+
+        // Next state (maintain by default)
+        state_d        = state_q;
+
+        // Adder inputs (unused defaults, always overridden by both states below)
+        adder_a        = 'x;
+        adder_b        = 'x;
+        adder_subtract = 1'bx;
+
+        // Module outputs (masked when inactive)
+        ready          = 1'b0;
+        result         = '0;
+        adjust         = 1'b0;
+
+        // Data registers (maintain by default)
+        result_ab_d    = result_ab_q;
+        carry_ab_d     = carry_ab_q;
+
+        // The single shared adder is used once per state: first for a ± b, then for the correction.
+        unique case (state_q)
+            StAdd: begin
+                // Adder computes a ± b
+                adder_a        = a;
+                adder_b        = b;
+                adder_subtract = subtract;
+
+                // sample adder results and move to next state when inputs are valid and adder is ready
+                if (valid && adder_ready) begin
+                    result_ab_d = adder_result;
+                    carry_ab_d  = adder_carry_out;
+                    state_d     = StAdjust;
+                end
+            end
+            StAdjust: begin
+                // Adder computes result_ab ∓ modulus for correction (subtract sense inverted)
+                adder_a        = result_ab_q;
+                adder_b        = modulus;
+                adder_subtract = ~subtract;
+
+                // assign final results when adder is ready
+                // new adder result is discarded if adjust is not needed
+                if (adder_ready) begin
+                    ready    = 1'b1;
+                    // assign adjust first and then reuse for result
+                    adjust   = // when a-b is negative (carry_ab=1)
+                               ( subtract && carry_ab_q) ||
+                               // when a+b overflowed (carry_ab=1), can't rely on adder_carry_out
+                               (!subtract && carry_ab_q) ||
+                               // when a+b did not overflow but subtracting m does not make it negative
+                               (!subtract && ~adder_carry_out);
+                    result   = adjust ? adder_result : result_ab_q;
+                    // one operation per StAdd -> StAdjust pair: go back and wait for the next request
+                    state_d  = StAdd;
+                end
+            end
+            default: ; // empty - defaults are set outside the case statement
+        endcase
+    end
+
+    // Sequential: register updates, asynchronous active-low reset
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            state_q     <= StAdd;
+        end else begin
+            state_q     <= state_d;
+        end
+    end
+    // Intermediate sum and carry captured in StAdd, consumed in StAdjust
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            result_ab_q <= '0;
+            carry_ab_q  <= 1'b0;
+        end else begin
+            result_ab_q <= result_ab_d;
+            carry_ab_q  <= carry_ab_d;
+        end
+    end
+
+endmodule
diff --git a/verilog/rtl/mod_inv.sv b/verilog/rtl/mod_inv.sv
new file mode 100644
index 0000000..d496a58
--- /dev/null
+++ b/verilog/rtl/mod_inv.sv
@@ -0,0 +1,308 @@
+// Mod_inv - Modular inverse via Binary Extended GCD
+//
+// Computes a^(-1) mod modulus, or reports that the inverse does not exist.
+// Assumes the modulus is an odd prime (secp256k1 field prime or curve order).
+//
+// Drives an external mod_add instance for all arithmetic; the caller wires
+// mod_add_{valid,a,b,subtract} to the mod_add inputs and feeds
+// mod_add_{result,ready,adjust} back as inputs to this module.
+//
+// Protocol:
+//   1. Assert valid and hold a, modulus stable until ready pulses
+//   2. Wire the external mod_add as directed by mod_add_valid/a/b/subtract
+//   3. Feed mod_add_result, mod_add_ready, mod_add_adjust back as inputs
+//   4. ready pulses high for one cycle when result is available
+//   5. When ready, check exists: 1 = result is the inverse, 0 = no inverse
+//
+// State machine:
+//               _________________________________________________________
+//              |    _______________________                              |
+//              |   |                       |                             |
+//              |   |    -> StDiv2Add -> StDiv2P1                         |
+//              v   v   |                                                 |
+//   StIdle -> StOpSel -|-> StSubRems -> StSubRemsRev (Conditional) -> StSubCoeffs
+//                      |
+//                       -> StDone    -> StIdle
+
+
+module mod_inv
+    import arith_pkg::*; // import in module header to be used in port list
+(
+    input  logic             clk,
+    input  logic             rst_n,
+    // Control (assert valid and hold a, modulus stable until ready pulses)
+    input  logic             valid,
+    input  logic [WIDTH-1:0] a,
+    input  logic [WIDTH-1:0] modulus,
+
+    // External mod_add resp
+    input  logic             mod_add_ready,
+    input  logic [WIDTH-1:0] mod_add_result,
+    input  logic             mod_add_adjust, // only consulted in StSubRems, where 1 is read as "u < v"
+
+    // Result
+    output logic             ready,
+    output logic             exists,
+    output logic [WIDTH-1:0] result,
+
+    // External mod_add req
+    output logic             mod_add_valid,
+    output logic [WIDTH-1:0] mod_add_a,
+    output logic [WIDTH-1:0] mod_add_b,
+    output logic             mod_add_subtract
+);
+
+    // FSM states
+    typedef enum logic [2:0] {
+        StIdle,
+        StOpSel,
+        StDiv2Add,
+        StDiv2P1,
+        StSubRems,
+        StSubRemsRev,
+        StSubCoeffs,
+        StDone
+    } state_e;
+
+    // ---------------------------------------------------------------------------
+    // Registers
+    // ---------------------------------------------------------------------------
+
+    // FSM state
+    state_e           state_q,           state_d;
+
+    // Remainders (u, v) and the Bezout coefficients of a (s, t)
+    // The following invariants must hold after every update:
+    //      a*s == u mod m (same as a*s + m*x == u mod m)
+    //      a*t == v mod m (same as a*t + m*y == v mod m)
+    // Note: the coefficients of the modulus (x, y) vanish under mod m arithmetic, so they are not tracked
+    logic [WIDTH-1:0] u_rem_q,           u_rem_d;
+    logic [WIDTH-1:0] v_rem_q,           v_rem_d;
+    logic [WIDTH-1:0] s_coeff_q,         s_coeff_d;
+    logic [WIDTH-1:0] t_coeff_q,         t_coeff_d;
+
+    // Helper flags
+    logic             reduced_unv_q,     reduced_unv_d;     // 1 = u was reduced, 0 = v was reduced
+    logic             div2_unv_q,        div2_unv_d;        // 1 = dividing u/s, 0 = dividing v/t
+    logic             div2_coeff_odd_q,  div2_coeff_odd_d;  // was the original coefficient odd?
+
+    // ---------------------------------------------------------------------------
+    // Combinational helpers
+    // ---------------------------------------------------------------------------
+
+    // Coefficient paired with the remainder being halved: s when div2_unv_q = 1, t otherwise
+    wire  [WIDTH-1:0] div2_coeff = div2_unv_q ? s_coeff_q : t_coeff_q;
+
+    // ---------------------------------------------------------------------------
+    // FSM — combinational next-state, output decode, data register inputs
+    // ---------------------------------------------------------------------------
+
+    always_comb begin
+        // Outputs (inactive by default; only StDone drives them)
+        ready  = 1'b0;
+        result = '0;
+        exists = 1'b0;
+
+        // Registers (hold value by default)
+        state_d          = state_q;
+        u_rem_d          = u_rem_q;
+        v_rem_d          = v_rem_q;
+        s_coeff_d        = s_coeff_q;
+        t_coeff_d        = t_coeff_q;
+        reduced_unv_d    = reduced_unv_q;
+        div2_unv_d       = div2_unv_q;
+        div2_coeff_odd_d = div2_coeff_odd_q;
+
+        // mod_add outputs (masked when inactive)
+        mod_add_valid    = 1'b0;
+        mod_add_a        = '0;
+        mod_add_b        = '0;
+        mod_add_subtract = 1'b0;
+
+        unique case (state_q)
+            // -----------------------------------------------------------------
+            StIdle: begin
+                if (valid) begin
+                    u_rem_d   = a;          // u = a
+                    v_rem_d   = modulus;    // v = b = modulus
+                    s_coeff_d = 1;          // a*s == u mod m => s = 1
+                    t_coeff_d = 0;          // a*t == v mod m => t = 0
+                    state_d   = StOpSel;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StOpSel: begin
+                if (u_rem_q == '0) begin
+                    // Termination: gcd found (held in v, compared against 1 in StDone)
+                    state_d  = StDone;
+                end else if (!u_rem_q[0]) begin
+                    // u is even: divide u/s pair
+                    div2_unv_d       = 1'b1;
+                    div2_coeff_odd_d = s_coeff_q[0];
+                    state_d          = StDiv2Add;
+                end else if (!v_rem_q[0]) begin
+                    // v is even: divide v/t pair
+                    div2_unv_d       = 1'b0;
+                    div2_coeff_odd_d = t_coeff_q[0];
+                    state_d          = StDiv2Add;
+                end else begin
+                    // Both odd: subtract remainders
+                    state_d = StSubRems;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            // Div2: divide remainder by 2, adjust coefficient
+            //
+            //   r = r >> 1
+            //   if c is even:  c = c >> 1
+            //   if c is odd:   c = (c >> 1) + (mod >> 1),  then c = c + 1
+            //
+            // The odd case is split across StDiv2Add and StDiv2P1 to avoid
+            // exceeding WIDTH bits in the intermediate (c + mod) value.
+            // -----------------------------------------------------------------
+            StDiv2Add: begin
+                // Drive mod_add: (c >> 1) + (mod >> 1) in case it's needed
+                mod_add_valid    = 1'b1;
+                mod_add_a        = div2_coeff >> 1;
+                mod_add_b        = modulus >> 1;
+                mod_add_subtract = 1'b0;
+
+                if (mod_add_ready) begin
+                    // Shift the remainder and update coefficient based on parity
+                    if (div2_unv_q) begin
+                        u_rem_d   = u_rem_q >> 1;
+                        s_coeff_d = div2_coeff[0] ? mod_add_result : (div2_coeff >> 1);
+                    end else begin
+                        v_rem_d   = v_rem_q >> 1;
+                        t_coeff_d = div2_coeff[0] ? mod_add_result : (div2_coeff >> 1);
+                    end
+                    div2_coeff_odd_d = div2_coeff[0]; // same parity StOpSel latched: the coefficient regs have not changed since
+                    state_d          = StDiv2P1;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StDiv2P1: begin
+                // Drive mod_add: c + 1 in case it's needed (c is the coefficient as updated by StDiv2Add; subtract stays at its default 0)
+                mod_add_valid = 1'b1;
+                mod_add_a     = div2_coeff;
+                mod_add_b     = 1;
+
+                if (mod_add_ready) begin
+                    // Apply the +1 only if original coefficient was odd
+                    if (div2_coeff_odd_q) begin
+                        if (div2_unv_q)
+                            s_coeff_d = mod_add_result;
+                        else
+                            t_coeff_d = mod_add_result;
+                    end
+                    state_d = StOpSel;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StSubRems: begin
+                // Try u - v
+                mod_add_valid    = 1'b1;
+                mod_add_a        = u_rem_q;
+                mod_add_b        = v_rem_q;
+                mod_add_subtract = 1'b1;
+
+                if (mod_add_ready) begin
+                    if (!mod_add_adjust) begin
+                        // No underflow: u >= v
+                        u_rem_d       = mod_add_result;
+                        reduced_unv_d = 1'b1;
+                        state_d       = StSubCoeffs;
+                    end else begin
+                        // Underflow: u < v, need reverse subtraction (this mod_add_result is discarded)
+                        state_d = StSubRemsRev;
+                    end
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StSubRemsRev: begin
+                // v - u (guaranteed no underflow: only entered after StSubRems reported u < v)
+                mod_add_valid    = 1'b1;
+                mod_add_a        = v_rem_q;
+                mod_add_b        = u_rem_q;
+                mod_add_subtract = 1'b1;
+
+                if (mod_add_ready) begin
+                    v_rem_d       = mod_add_result;
+                    reduced_unv_d = 1'b0;
+                    state_d       = StSubCoeffs;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StSubCoeffs: begin
+                // If u was reduced: s = s - t, else: t = t - s
+                mod_add_valid    = 1'b1;
+                mod_add_a        = reduced_unv_q ? s_coeff_q : t_coeff_q;
+                mod_add_b        = reduced_unv_q ? t_coeff_q : s_coeff_q;
+                mod_add_subtract = 1'b1;
+
+                if (mod_add_ready) begin
+                    if (reduced_unv_q)
+                        s_coeff_d = mod_add_result;
+                    else
+                        t_coeff_d = mod_add_result;
+                    state_d = StOpSel;
+                end
+            end
+
+            // -----------------------------------------------------------------
+            StDone: begin
+                state_d = StIdle;
+                ready   = 1'b1;
+                exists  = (v_rem_q == 256'd1); // ignoring m = 0/1 cases (assuming large prime)
+                result  = exists ? t_coeff_q : '0;
+            end
+
+            default: ; // empty — defaults are set outside the case statement
+        endcase
+    end
+
+    // ---------------------------------------------------------------------------
+    // Sequential: register updates, asynchronous active-low reset
+    // ---------------------------------------------------------------------------
+
+    // State register
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) state_q <= StIdle;
+        else        state_q <= state_d;
+    end
+
+    // Remainders and coefficients registers
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            u_rem_q          <= '0;
+            v_rem_q          <= '0;
+            s_coeff_q        <= '0;
+            t_coeff_q        <= '0;
+        end else begin
+            u_rem_q          <= u_rem_d;
+            v_rem_q          <= v_rem_d;
+            s_coeff_q        <= s_coeff_d;
+            t_coeff_q        <= t_coeff_d;
+        end
+    end
+
+    // Helper flags registers
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            reduced_unv_q    <= 1'b0;
+            div2_unv_q       <= 1'b0;
+            div2_coeff_odd_q <= 1'b0;
+        end else begin
+            reduced_unv_q    <= reduced_unv_d;
+            div2_unv_q       <= div2_unv_d;
+            div2_coeff_odd_q <= div2_coeff_odd_d;
+        end
+    end
+
+endmodule
diff --git a/verilog/rtl/mod_mul.sv b/verilog/rtl/mod_mul.sv
new file mode 100644
index 0000000..c1defda
--- /dev/null
+++ b/verilog/rtl/mod_mul.sv
@@ -0,0 +1,182 @@
+// Mod_mul - Modular multiplication via binary shift-and-add (using modular add and double)
+//
+// Computes (a * b) mod modulus.
+// Drives an external mod_add instance for all additions; the caller wires
+// mod_add_{valid,a,b,subtract} to the mod_add inputs and feeds
+// mod_add_{result,ready} back as inputs to this module.
+//
+// Protocol:
+//   1. Assert valid and hold a, b stable until ready pulses
+//   2. Wire the external mod_add as directed by mod_add_valid/a/b/subtract
+//   3. Feed mod_add_result and mod_add_ready back as inputs
+//   4. ready pulses high for one cycle when result is available
+//
+// State machine: StIdle -> StAdd (conditional) -> StDone -> StIdle
+//                            ^            |
+//                            |__StDouble__|
+
+module mod_mul
+    import arith_pkg::*; // import in module header to be used in port list
+(
+    input  logic             clk,
+    input  logic             rst_n,
+    // Control
+    input  logic             valid,
+    // Operands (held stable throughout computation)
+    input  logic [WIDTH-1:0] a,
+    input  logic [WIDTH-1:0] b,
+    // input  logic [WIDTH-1:0] modulus, // feeding the modulus to mod_add is taken care of in the arith block
+
+    // External mod_add resp
+    input  logic             mod_add_ready,
+    input  logic [WIDTH-1:0] mod_add_result,
+
+    // Result
+    output logic             ready,
+    output logic [WIDTH-1:0] result,
+
+    // External mod_add req
+    output logic             mod_add_valid,
+    output logic [WIDTH-1:0] mod_add_a,
+    output logic [WIDTH-1:0] mod_add_b,
+    output logic             mod_add_subtract // always 0 for mod_mul (add and double)
+);
+
+    // Shift-and-add multiplier, LSB first: each multiplier bit costs at most one StAdd and one
+    // StDouble, and every addition is delegated to the external mod_add (mod_add_valid / mod_add_ready).
+
+    // FSM states
+    typedef enum logic [1:0] {
+        StIdle,
+        StAdd,
+        StDouble,
+        StDone
+    } state_e;
+
+    // ---------------------------------------------------------------------------
+    // Registers (_q = registered value, _d = value loaded at the next clock edge)
+    // ---------------------------------------------------------------------------
+
+    // FSM state
+    state_e           state_q,             state_d;
+
+    // Multiplicand, doubled via modular self-add once per consumed multiplier bit
+    // REVISIT - could use MSB first shift-and-add to avoid having a register for the multiplicand
+    logic [WIDTH-1:0] multiplicand_lsh_q,  multiplicand_lsh_d;
+
+    // Multiplier, shifted right by one per doubling so that only bit 0 is ever tested
+    // REVISIT - could also use a mux to index the multiplier bits (though that is also significant gate count)
+    logic [WIDTH-1:0] multiplier_rsh_q,    multiplier_rsh_d;
+
+    // Running sum of the selected multiplicand multiples; the product once StDone is reached
+    logic [WIDTH-1:0] result_acc_q,        result_acc_d;
+
+    // ---------------------------------------------------------------------------
+    // Output decode — every output is a pure function of the registered state
+    // ---------------------------------------------------------------------------
+
+    // State predicates
+    wire in_add    = (state_q == StAdd);
+    wire in_double = (state_q == StDouble);
+    wire in_done   = (state_q == StDone);
+
+    // Result: presented for the single StDone cycle, zero otherwise
+    assign ready  = in_done;
+    assign result = in_done ? result_acc_q : '0;
+
+    // mod_add request (masked to zero when inactive):
+    //   StAdd    : acc + multiplicand_lsh
+    //   StDouble : multiplicand_lsh + multiplicand_lsh
+    assign mod_add_valid    = in_add || in_double;
+    assign mod_add_subtract = 1'b0;
+    assign mod_add_b        = mod_add_valid ? multiplicand_lsh_q : '0;
+
+    always_comb begin
+        if      (in_add)    mod_add_a = result_acc_q;
+        else if (in_double) mod_add_a = multiplicand_lsh_q;
+        else                mod_add_a = '0;
+    end
+
+    // ---------------------------------------------------------------------------
+    // Next-state logic for the FSM and the data registers
+    // ---------------------------------------------------------------------------
+
+    always_comb begin
+        // Hold every register unless the active state overrides it below
+        state_d             = state_q;
+        multiplicand_lsh_d  = multiplicand_lsh_q;
+        multiplier_rsh_d    = multiplier_rsh_q;
+        result_acc_d        = result_acc_q;
+
+        unique case (state_q)
+            StIdle: begin
+                // A request captures both operands and clears the accumulator
+                if (valid) begin
+                    state_d             = StAdd;
+                    result_acc_d        = '0;
+                    multiplier_rsh_d    = b;
+                    multiplicand_lsh_d  = a;
+                end
+            end
+
+            StAdd: begin
+                // Wait for acc + multiplicand_lsh from mod_add
+                if (mod_add_ready) begin
+                    // The sum is kept only when the multiplier bit under test is set
+                    if (multiplier_rsh_q[0]) begin
+                        result_acc_d = mod_add_result;
+                    end
+
+                    // Stop once all remaining multiplier bits are zero, else keep doubling
+                    state_d = (multiplier_rsh_q[WIDTH-1:1] != '0) ? StDouble : StDone;
+                end
+            end
+
+            StDouble: begin
+                // Wait for multiplicand_lsh * 2 from mod_add
+                if (mod_add_ready) begin
+                    multiplier_rsh_d   = multiplier_rsh_q >> 1;
+                    multiplicand_lsh_d = mod_add_result;
+
+                    // Bit 1 becomes bit 0 after the shift: go to StAdd only when it is
+                    // set, otherwise double again right away (skips a discarded addition)
+                    state_d = multiplier_rsh_q[1] ? StAdd : StDouble;
+                end
+            end
+
+            StDone: begin
+                // ready/result are decoded from this state; return to idle next cycle
+                state_d = StIdle;
+            end
+
+            default: ; // empty — defaults are set outside the case statement
+        endcase
+    end
+
+    // ---------------------------------------------------------------------------
+    // Sequential: register updates, asynchronous active-low reset
+    // ---------------------------------------------------------------------------
+
+    // FSM state register
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            state_q <= StIdle;
+        end else begin
+            state_q <= state_d;
+        end
+    end
+
+    // Data registers
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            multiplicand_lsh_q <= '0;
+            multiplier_rsh_q   <= '0;
+            result_acc_q       <= '0;
+        end else begin
+            multiplicand_lsh_q <= multiplicand_lsh_d;
+            multiplier_rsh_q   <= multiplier_rsh_d;
+            result_acc_q       <= result_acc_d;
+        end
+    end
+
+endmodule
diff --git a/verilog/rtl/secp256k1_pkg.sv b/verilog/rtl/secp256k1_pkg.sv
new file mode 100644
index 0000000..968abb6
--- /dev/null
+++ b/verilog/rtl/secp256k1_pkg.sv
@@ -0,0 +1,28 @@
+package secp256k1_pkg;
+
+    import arith_pkg::*; // presumably the source of WIDTH, which is not declared in this package
+
+    // -------------------------------------------------------------------------
+    // secp256k1 constants
+    // -------------------------------------------------------------------------
+
+    // field prime: p = 2^256 - 2^32 - 977
+    // written as 2^32 * (2^224 - 1) - 977 because 2**256 itself does not fit in a 256-bit literal
+    localparam logic [WIDTH-1:0] PRIME_P =
+        256'd2**32 * (256'd2**224 - 256'd1) - 256'd977;
+
+    // curve order n (no closed form exists, so it is given as a literal)
+    localparam logic [WIDTH-1:0] PRIME_N =
+        256'hFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE_BAAEDCE6AF48A03BBFD25E8CD0364141;
+
+    // curve parameters: y² = x³ + ax + b, a=0, b=7
+    localparam logic [WIDTH-1:0] CURVE_A1 = 1 * 0;  // 1*a = 0
+    localparam logic [WIDTH-1:0] CURVE_B3 = 3 * 7;  // 3*b = 21
+
+    // Generator point G
+    localparam logic [WIDTH-1:0]
+    G_X = 256'h79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798,
+    G_Y = 256'h483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8,
+    G_Z = 1; // an affine point (x, y) is written in projective coordinates with Z = 1
+
+endpackage
diff --git a/verilog/rtl/security_block.sv b/verilog/rtl/security_block.sv
new file mode 100644
index 0000000..2cb182e
--- /dev/null
+++ b/verilog/rtl/security_block.sv
@@ -0,0 +1,257 @@
+// Security Block
+//
+// Manages ECDSA-based license validation, a TRNG nonce source, an allowance
+// counter, and a gated workload unit.
+//
+// Protocol:
+//   1. On startup, waits INIT_DELAY then generates an initial nonce
+//   2. nonce_ready is held high while a fresh nonce is available in nonce[] (nonce[] reads 0 otherwise)
+//   3. Submit a license via valid-ready handshake: assert license_valid with
+//      (license_r, license_s); transfer completes when license_ready is high.
+//      The signature must be over the current nonce as the message hash.
+//   4. On valid license: allowance += ALLOWANCE_INCREMENT (saturating), new nonce
+//      On invalid license: same nonce retained, can retry
+//   5. Workload (signed 8-bit add) is gated: result is zeroed when allowance == 0
+//   6. Allowance decrements by 1 every cycle while > 0
+
+module security_block
+    import arith_pkg::*;
+# (
+    localparam int unsigned ALLOW_W  = 64,
+    localparam int unsigned WORKLD_W =  8
+)(
+    input  logic             clk,
+    input  logic             rst_n,
+
+    // License interface (valid-ready)
+    input  logic             license_valid,
+    output logic             license_ready,
+    input  logic [WIDTH-1:0] license_r,
+    input  logic [WIDTH-1:0] license_s,
+
+    // Workload interface
+    input  logic                workload_valid,
+    input  logic [WORKLD_W-1:0] workload_a,
+    input  logic [WORKLD_W-1:0] workload_b,
+
+    // TRNG seed (for simulation)
+    input  logic [WIDTH-1:0] trng_seed,
+    input  logic             trng_load_seed,
+
+    // Outputs
+    output logic [WIDTH-1:0]    nonce,
+    output logic                nonce_ready,
+    output logic [WORKLD_W-1:0] workload_result,
+    output logic                result_valid,
+    output logic [ALLOW_W-1:0]  allowance,
+    output logic                enabled
+);
+
+    // -------------------------------------------------------------------------
+    // Constants
+    // -------------------------------------------------------------------------
+
+    localparam int INIT_DELAY = 100;
+    localparam int DELAYCNT_W = $clog2(INIT_DELAY + 1); // delay counter width; +1 so the counter can hold INIT_DELAY itself, even when it is a power of two
+
+    localparam logic [ALLOW_W-1:0]  ALLOWANCE_INCREMENT = 64'd1_000_000_000_000;
+
+    // -------------------------------------------------------------------------
+    // FSM states
+    // -------------------------------------------------------------------------
+
+    typedef enum logic [2:0] {
+        StInitDelay,
+        StRequestNonce,
+        StPublishAndWait,
+        StWaitVerify
+    } state_e;
+
+    // -------------------------------------------------------------------------
+    // Registers (_q = registered value, _d = value loaded at the next clock edge)
+    // -------------------------------------------------------------------------
+
+    state_e                 state_q,            state_d;
+    logic [ALLOW_W-1:0]     allowance_q,        allowance_d;
+    logic                   result_valid_q,     result_valid_d;
+    logic [WORKLD_W-1:0]    workload_result_q,  workload_result_d;
+    logic [DELAYCNT_W-1:0]  delay_cnt_q,        delay_cnt_d;  // counts init delay
+
+
+    // -------------------------------------------------------------------------
+    // TRNG instance
+    // -------------------------------------------------------------------------
+
+    logic             trng_request_new;
+    logic [WIDTH-1:0] trng_nonce;
+    logic             trng_nonce_valid;
+
+    trng u_trng (
+        .clk         (clk),
+        .rst_n       (rst_n),
+        .enable      (1'b1),
+        .request_new (trng_request_new),
+        .load_seed   (trng_load_seed),
+        .seed        (trng_seed),
+        .nonce       (trng_nonce),
+        .nonce_valid (trng_nonce_valid)
+    );
+
+    // -------------------------------------------------------------------------
+    // ECDSA instance
+    // -------------------------------------------------------------------------
+
+    // Input valid to be driven from the FSM
+    logic             ecdsa_valid;
+
+    // Outputs
+    logic             ecdsa_ready;
+    logic             ecdsa_verif_passed;
+
+    ecdsa u_ecdsa (
+        .clk          (clk),
+        .rst_n        (rst_n),
+        .valid        (ecdsa_valid),
+        .z            (trng_nonce),
+        .r            (license_r),
+        .s            (license_s),
+        .ready        (ecdsa_ready),
+        .verif_passed (ecdsa_verif_passed)
+    );
+
+    // -------------------------------------------------------------------------
+    // Allowance — combinational next value
+    // -------------------------------------------------------------------------
+
+    logic increment_allowance;
+
+    // one bit wider for overflow check
+    wire [ALLOW_W:0]  allowance_sum = {1'b0, allowance_q} + {1'b0, ALLOWANCE_INCREMENT};
+
+    always_comb begin
+        if (increment_allowance)
+            allowance_d = !allowance_sum[ALLOW_W] ? allowance_sum[ALLOW_W-1:0] : '1; // sum if no overflow, else max value (all 1s)
+        else if (allowance_q != 0)
+            allowance_d = allowance_q - 1;
+        else
+            allowance_d = '0;
+    end
+
+    // -------------------------------------------------------------------------
+    // Workload — combinational, pipelined one cycle
+    // -------------------------------------------------------------------------
+
+    assign workload_result_d = {WORKLD_W{enabled}} & (workload_a + workload_b);
+    assign result_valid_d    = workload_valid;
+
+    // -------------------------------------------------------------------------
+    // FSM — combinational
+    // -------------------------------------------------------------------------
+
+    always_comb begin
+        // Register input defaults
+        state_d             = state_q;
+        delay_cnt_d         = delay_cnt_q;
+
+        // Combinational signal defaults
+        trng_request_new    = 1'b0;
+        nonce_ready         = 1'b0;
+        nonce               =   '0;
+        ecdsa_valid         = 1'b0;
+        license_ready       = 1'b0;
+        increment_allowance = 1'b0;
+
+        unique case (state_q)
+
+            StInitDelay: begin // wait out the post-reset delay before the first nonce request
+                delay_cnt_d = delay_cnt_q + 1;
+                if (int'(delay_cnt_q) >= INIT_DELAY)
+                    state_d = StRequestNonce;
+            end
+
+            StRequestNonce: begin // one-cycle request to the TRNG
+                trng_request_new = 1'b1;
+                state_d          = StPublishAndWait;
+            end
+
+            StPublishAndWait: begin // expose the nonce and wait for a license to be offered
+                if (trng_nonce_valid) begin
+                    nonce_ready = 1;
+                    nonce       = trng_nonce;
+                    if (license_valid) begin
+                        state_d = StWaitVerify;
+                    end
+                end
+            end
+
+            StWaitVerify: begin // keep the ECDSA request asserted until it reports ready
+                ecdsa_valid = 1'b1;
+                if (ecdsa_ready) begin
+                    license_ready = 1'b1;
+                    if (ecdsa_verif_passed) begin
+                        increment_allowance = 1'b1;
+                        state_d             = StRequestNonce;
+                    end else begin
+                        state_d = StPublishAndWait;
+                    end
+                end
+            end
+
+            default: ; // empty - defaults are set outside the case statement
+        endcase
+    end
+
+    // -------------------------------------------------------------------------
+    // Assign register based outputs
+    // -------------------------------------------------------------------------
+
+    assign allowance = allowance_q;
+    assign enabled   = (allowance_q != 0) ? 1'b1 : 1'b0;
+
+    assign workload_result = workload_result_q;
+    assign result_valid    = result_valid_q;
+
+    // -------------------------------------------------------------------------
+    // Sequential: register updates, asynchronous active-low reset
+    // -------------------------------------------------------------------------
+
+    // FSM state register
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            state_q <= StInitDelay;
+        end else begin
+            state_q <= state_d;
+        end
+    end
+
+    // Allowance register
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            allowance_q <= '0;
+        end else begin
+            allowance_q <= allowance_d;
+        end
+    end
+
+    // Workload result pipeline registers
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            result_valid_q <= 1'b0;
+            workload_result_q  <= '0;
+        end else begin
+            result_valid_q <= result_valid_d;
+            workload_result_q  <= workload_result_d;
+        end
+    end
+
+    // Init delay counter
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            delay_cnt_q <= '0;
+        end else begin
+            delay_cnt_q <= delay_cnt_d;
+        end
+    end
+
+
+endmodule
diff --git a/verilog/rtl/trng.sv b/verilog/rtl/trng.sv
new file mode 100644
index 0000000..1868a37
--- /dev/null
+++ b/verilog/rtl/trng.sv
@@ -0,0 +1,57 @@
+// TRNG - True Random Number Generator (counter-based prototype)
+//
+// Generates 256-bit nonces for use as ECDSA message hashes.
+// In production this would use a ring oscillator (e.g. Vasyltsov et al.);
+// here a free-running counter is used for deterministic simulation.
+//
+// Protocol:
+//   1. Assert enable to run the counter
+//   2. Pulse request_new to latch the current counter value; nonce_valid rises
+//   3. nonce is stable until the next request_new pulse
+//   4. For simulation: assert load_seed for one cycle to seed the counter
+
+module trng
+    import arith_pkg::*; // import in module header to be used in port list
+(
+    input  logic             clk,
+    input  logic             rst_n,
+    input  logic             enable,
+    input  logic             request_new,
+    input  logic             load_seed,
+    input  logic [WIDTH-1:0] seed,
+
+    output logic [WIDTH-1:0] nonce,
+    output logic             nonce_valid
+);
+
+    logic [WIDTH-1:0] counter_q,     counter_d;     // running counter
+    logic [WIDTH-1:0] nonce_q,       nonce_d;       // last sampled counter value
+    logic             nonce_valid_q, nonce_valid_d; // set by request_new, cleared only by reset
+
+    // Next-state values: seeding beats counting; request_new samples the current counter
+    always_comb begin
+        counter_d     = counter_q;
+        nonce_d       = nonce_q;
+        nonce_valid_d = nonce_valid_q;
+        if      (load_seed) counter_d = seed;
+        else if (enable)    counter_d = counter_q + 1;
+        if (request_new) {nonce_d, nonce_valid_d} = {counter_q, 1'b1};
+    end
+
+    // Registers, asynchronous active-low reset
+    always_ff @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            counter_q     <= '0;
+            nonce_q       <= '0;
+            nonce_valid_q <= 1'b0;
+        end else begin
+            counter_q     <= counter_d;
+            nonce_q       <= nonce_d;
+            nonce_valid_q <= nonce_valid_d;
+        end
+    end
+
+    assign nonce       = nonce_q;
+    assign nonce_valid = nonce_valid_q;
+
+endmodule
diff --git a/verilog/tb/sim_main.cpp b/verilog/tb/sim_main.cpp
new file mode 100644
index 0000000..ff7bf69
--- /dev/null
+++ b/verilog/tb/sim_main.cpp
@@ -0,0 +1,62 @@
+#include "Vtb.h"
+#include "verilated.h"
+#if VM_TRACE
+#include "verilated_fst_c.h"
+#endif
+#include <iostream>
+
+int main(int argc, char** argv) {
+    // Clocks the Verilated `tb` top level through a reset sequence and then until
+    // $finish or the cycle limit. Returns 0 only when the testbench called $finish.
+    VerilatedContext* ctx = new VerilatedContext;
+    ctx->commandArgs(argc, argv);
+#if VM_TRACE
+    ctx->traceEverOn(true);
+#endif
+    Vtb* tb = new Vtb{ctx};
+#if VM_TRACE
+    VerilatedFstC* fst = new VerilatedFstC;
+    tb->trace(fst, 99);
+    fst->open("dump.fst");
+    #define WAVE_DUMP(t) fst->dump(t)
+#else
+    #define WAVE_DUMP(t)
+#endif
+
+    // initialize rst and clk
+    tb->rst_n = 1;
+    tb->clk = 0;
+    tb->eval();
+    WAVE_DUMP(ctx->time());
+
+    // run 1 cycle
+    tb->clk = 1; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+    tb->clk = 0; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+
+    // Assert reset (rst_n was driven high above, so this is a 1 -> 0 transition)
+    tb->rst_n = 0;
+
+    // run 2 cycles
+    tb->clk = 1; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+    tb->clk = 0; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+    tb->clk = 1; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+    tb->clk = 0; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+
+    // Deassert reset
+    tb->rst_n = 1;
+
+    // Run the test until $finish; the bound keeps a hung testbench from spinning forever
+    for (int i = 0; i < 50000000 && !ctx->gotFinish(); i++) {
+        tb->clk = 1; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+        tb->clk = 0; tb->eval(); ctx->timeInc(1); WAVE_DUMP(ctx->time());
+    }
+    const bool finished = ctx->gotFinish();  // sample before ctx is destroyed
+    if (!finished) std::cerr << "ERROR: simulation hit the cycle limit without $finish" << std::endl;
+    tb->final();  // run SystemVerilog final blocks before tearing the model down
+#if VM_TRACE
+    fst->close(); delete fst;
+#endif
+    delete tb;
+    delete ctx;
+    return finished ? 0 : 1;
+}
diff --git a/verilog/tb/tb_arith.sv b/verilog/tb/tb_arith.sv
new file mode 100644
index 0000000..4e97ab8
--- /dev/null
+++ b/verilog/tb/tb_arith.sv
@@ -0,0 +1,248 @@
+module tb (
+    input logic clk,
+    input logic rst_n
+);
+
+    // ---------------------------------------------------------------------------
+    // Test vector type and array
+    // ---------------------------------------------------------------------------
+
+    typedef struct {
+        string        name;
+        logic [255:0] a;
+        logic [255:0] b;
+        logic [1:0]   op;        // 0=add, 1=sub, 2=mul, 3=inv (b ignored)
+        logic         prime_sel; // 0=prime_p, 1=prime_n
+        logic [255:0] expected;
+    } test_vec_t;
+
+    // secp256k1 field prime: p = 2^256 - 2^32 - 977
+    localparam logic [255:0] P =
+        256'hFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F;
+    // secp256k1 curve order n
+    localparam logic [255:0] N =
+        256'hFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141;
+
+    // ---------------------------------------------------------------------------
+    // Active tests: add + sub + mul + inv
+    // When adding or removing vectors, update NumTests to match the array length
+    // ---------------------------------------------------------------------------
+
+    localparam int NumTests = 24;
+
+    localparam test_vec_t TESTS [NumTests] = '{
+
+        // --- Addition ---
+        '{name: "Add: 100 + 200 mod p",
+          a: 256'd100, b: 256'd200, op: 2'd0, prime_sel: 1'b0,
+          expected: 256'd300},
+
+        '{name: "Add: (p-63) + 100 mod p (wrap)",
+          a: P - 256'd63, b: 256'd100, op: 2'd0, prime_sel: 1'b0,
+          expected: 256'd37},
+
+        '{name: "Add: 12345 + 0 mod p",
+          a: 256'd12345, b: 256'd0, op: 2'd0, prime_sel: 1'b0,
+          expected: 256'd12345},
+
+        '{name: "Add: 100 + 200 mod n",
+          a: 256'd100, b: 256'd200, op: 2'd0, prime_sel: 1'b1,
+          expected: 256'd300},
+
+        // --- Subtraction ---
+        '{name: "Sub: 500 - 300 mod p",
+          a: 256'd500, b: 256'd300, op: 2'd1, prime_sel: 1'b0,
+          expected: 256'd200},
+
+        '{name: "Sub: 100 - 200 mod p (wrap)",
+          a: 256'd100, b: 256'd200, op: 2'd1, prime_sel: 1'b0,
+          expected: P - 256'd100},
+
+        '{name: "Sub: (p-1) - 10 mod p",
+          a: P - 256'd1, b: 256'd10, op: 2'd1, prime_sel: 1'b0,
+          expected: P - 256'd11},
+
+        '{name: "Sub: 12345 - 0 mod p",
+          a: 256'd12345, b: 256'd0, op: 2'd1, prime_sel: 1'b0,
+          expected: 256'd12345},
+
+        '{name: "Sub: 10 - 20 mod n (wrap)",
+          a: 256'd10, b: 256'd20, op: 2'd1, prime_sel: 1'b1,
+          expected: N - 256'd10},
+
+        // --- Multiplication (op=2) ---
+        '{name: "Mul: 3 * 5 mod p",
+          a: 256'd3, b: 256'd5, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h000000000000000000000000000000000000000000000000000000000000000f},
+
+        '{name: "Mul: 12345 * 0 mod p",
+          a: 256'd12345, b: 256'd0, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h0000000000000000000000000000000000000000000000000000000000000000},
+
+        '{name: "Mul: 12345 * 1 mod p",
+          a: 256'd12345, b: 256'd1, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h0000000000000000000000000000000000000000000000000000000000003039},
+
+        '{name: "Mul: 123456 * 789012 mod p",
+          a: 256'd123456, b: 256'd789012, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h00000000000000000000000000000000000000000000000000000016adfc2d00},
+
+        '{name: "Mul: 64-bit operands mod p",
+          a: 256'd12345678901234, b: 256'd98765432109876, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h000000000000000000000000000000000000000003f09a63c9ae1be72ffc8328},
+
+        '{name: "Mul: 128-bit operands mod p",
+          a: 256'd123456789012345678901234567890,
+          b: 256'd987654321098765432109876543210, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h00000000000000136ccc118300207d2e6cfe0022e5d56a89116ec6de5d5f3ff4},
+
+        '{name: "Mul: 256-bit operands mod p",
+          a: 256'd123456789012345678901234567890123456789,
+          b: 256'd987654321098765432109876543210987654321, op: 2'd2, prime_sel: 1'b0,
+          expected: 256'h0d936c6dd454c29c60200b8f07db6a2cc48ee37874bd5a6e7df6807e34223f56},
+
+        '{name: "Mul: 12345 * 67890 mod n",
+          a: 256'd12345, b: 256'd67890, op: 2'd2, prime_sel: 1'b1,
+          expected: 256'h0000000000000000000000000000000000000000000000000000000031f46c22},
+
+        // --- Inversion (op=3, b ignored) ---
+        '{name: "Inv: 3^-1 mod p",
+          a: 256'd3, b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa9fffffd75},
+
+        '{name: "Inv: 1^-1 mod p",
+          a: 256'd1, b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'h0000000000000000000000000000000000000000000000000000000000000001},
+
+        '{name: "Inv: (p-1)^-1 mod p",
+          a: P - 256'd1, b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'hfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2e},
+
+        '{name: "Inv: 64-bit operand mod p",
+          a: 256'd123456789012345, b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'h43935996906f5d218e1ec367f09936b53fc2d144ffe34e491ea06c94d9c5b23a},
+
+        '{name: "Inv: 128-bit operand mod p",
+          a: 256'd123456789012345678901234567890, b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'hfe9887f806cf8d2b479104f140d50f5ad3564dbf2da0aac4102af985dcfbdcc1},
+
+        '{name: "Inv: 256-bit operand mod p",
+          a: 256'd12345678901234567890123456789012345678901234567890,
+          b: 256'd0, op: 2'd3, prime_sel: 1'b0,
+          expected: 256'he283f7a0797a92877a86eafa0c633f36504f8c6dc2fcc48c94784d7b6b356746},
+
+        '{name: "Inv: 999999999999999999^-1 mod n",
+          a: 256'd999999999999999999, b: 256'd0, op: 2'd3, prime_sel: 1'b1,
+          expected: 256'h770324249fefd1cf9af30bf8abb7b824d83d80511a9c7f91c354d804d1eb0322}
+    };
+
+    // TB signal to avoid execution before reset
+    logic reset_done = 0;
+
+    // ---------------------------------------------------------------------------
+    // DUT
+    // ---------------------------------------------------------------------------
+
+    // Driven exclusively by the always block below
+    logic         valid     = 1'b0;
+    logic [1:0]   op        = '0;
+    logic         prime_sel = 1'b0;
+    logic [255:0] a         = '0;
+    logic [255:0] b         = '0;
+
+    logic         ready;
+    logic [255:0] result;
+
+    arith u_dut (
+        .clk       (clk),
+        .rst_n     (rst_n),
+        .valid     (valid),
+        .op        (op),
+        .modulus   (prime_sel ? N : P),
+        .a         (a),
+        .b         (b),
+        .ready     (ready),
+        .result    (result)
+    );
+
+    // ---------------------------------------------------------------------------
+    // Test sequencer — sole driver of DUT inputs
+    // ---------------------------------------------------------------------------
+
+    // using separate pointers for request and response to allow back-to-back testing
+    int next_req_ptr = 0; // Next request to be driven to the DUT
+    int curr_rsp_ptr = 0; // Current response driven by the DUT when ready = 1
+    int pass_count   = 0;
+    int fail_count   = 0;
+    int cycles       = 0; // number of cycles with valid asserted for the current request
+
+    // Sequencer: a clocked always block so the DUT inputs can be driven with non-blocking assignments in Verilator
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            reset_done <= 1'b1; // never cleared again; gates the branch below until a reset has been seen
+
+            valid     <= 1'b0;
+            op        <= '0;
+            prime_sel <= 1'b0;
+            a         <= '0;
+            b         <= '0;
+
+        end else if (reset_done) begin
+
+            // Drive next request (or idle) when nothing is outstanding or ready is seen this cycle
+            if (!valid || ready) begin
+                if (next_req_ptr < NumTests) begin
+                    valid        <= 1'b1;
+                    cycles       <= 1; // restart the per-request cycle count
+                    op           <= TESTS[next_req_ptr].op;
+                    prime_sel    <= TESTS[next_req_ptr].prime_sel;
+                    a            <= TESTS[next_req_ptr].a;
+                    b            <= TESTS[next_req_ptr].b;
+                    next_req_ptr <= next_req_ptr + 1;
+                end else begin
+                    valid  <= 1'b0; // no vectors left: stop requesting
+                    cycles <= 0;
+                end
+            end
+
+            // Check response and handle timeout
+            if (ready) begin
+                // Result available — check and advance (=== so X/Z bits in result count as a mismatch)
+                if (result === TESTS[curr_rsp_ptr].expected) begin
+                    $display("PASS  [%s] — %0d cycle(s)",
+                             TESTS[curr_rsp_ptr].name, cycles);
+                    pass_count <= pass_count + 1;
+                end else begin
+                    $display("FAIL  [%s]", TESTS[curr_rsp_ptr].name);
+                    $display("      expected: %h", TESTS[curr_rsp_ptr].expected);
+                    $display("      actual:   %h", result);
+                    fail_count <= fail_count + 1;
+                end
+
+                curr_rsp_ptr <= curr_rsp_ptr + 1;
+
+            end else if (valid) begin
+                // Waiting — increment cycle counter and watch for timeout
+                cycles <= cycles + 1;
+                if (cycles > 4000) begin
+                    $display("FAIL  [%s] — timeout", TESTS[curr_rsp_ptr].name);
+                    fail_count      <= fail_count + 1;
+                    curr_rsp_ptr    <= curr_rsp_ptr + 1;
+                    valid           <= 1'b0; // drop the request; the next vector is issued on a later edge
+                end
+            end
+
+            // All tests complete (reads the pre-edge pointer, so this fires one edge after the last response/timeout)
+            if (curr_rsp_ptr == NumTests) begin
+                $display("");
+                if (fail_count == 0)
+                    $display("All %0d arith tests passed.", pass_count);
+                else
+                    $display("arith: %0d passed, %0d failed.", pass_count, fail_count);
+                $finish; // NOTE(review): also used when fail_count != 0 — confirm the harness detects failures from the log
+            end
+
+        end
+    end
+
+endmodule
diff --git a/verilog/tb/tb_ecdsa.sv b/verilog/tb/tb_ecdsa.sv
new file mode 100644
index 0000000..4096146
--- /dev/null
+++ b/verilog/tb/tb_ecdsa.sv
@@ -0,0 +1,183 @@
+module tb (
+    input logic clk,
+    input logic rst_n
+);
+
+    // -------------------------------------------------------------------------
+    // Test vector type
+    // -------------------------------------------------------------------------
+
+    typedef struct {
+        string        name;
+        logic [255:0] z;
+        logic [255:0] r;
+        logic [255:0] s;
+        logic         expect_verif_passed;
+    } test_vec_t;
+
+    // -------------------------------------------------------------------------
+    // Test vectors (r, s pre-computed from ECDSA signing with d=2, Q=2G)
+    // -------------------------------------------------------------------------
+
+    localparam int NumTests = 8;
+
+    localparam test_vec_t TESTS [NumTests] = '{
+
+        // Test 1: Valid signature (z=12345, k=7)
+        '{name: "Valid: z=12345, k=7",
+          z: 256'h0000000000000000000000000000000000000000000000000000000000003039,
+          r: 256'h5cbdf0646e5db4eaa398f365f2ea7a0e3d419b7e0330e39ce92bddedcac4f9bc,
+          s: 256'hf5ed201cb1d1a1679c74d7d3fc42fe4c1f3ae9c52970ca600b9c474eec66cf51,
+          expect_verif_passed: 1'b1},
+
+        // Test 2: Valid signature (z=0xDEADBEEF, k=0x123456)
+        '{name: "Valid: z=0xDEADBEEF, k=0x123456",
+          z: 256'h00000000000000000000000000000000000000000000000000000000DEADBEEF,
+          r: 256'hf8ccf508990ceef9e5b84f5aeb9fee1739d3d3b140fc05e5b2ff58524c660ba2,
+          s: 256'h98c86259b3f72418d11058c5ec03fc5dca499123880da2d4a989089afef4be26,
+          expect_verif_passed: 1'b1},
+
+        // Test 3: Valid signature (256-bit z, k)
+        '{name: "Valid: 256-bit z, k",
+          z: 256'hb94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9,
+          r: 256'hd595ec6770e9878b6ee380665e7f6785f32cf6a2f2b31343504c4e9e96622ff0,
+          s: 256'h8ce29a99cf1a9f77a7cd29fbd9f240b76b3215222b10ce7fa184083b2782a379,
+          expect_verif_passed: 1'b1},
+
+        // Test 4: Invalid — wrong message hash (z=99999 instead of 12345)
+        '{name: "Invalid: wrong z",
+          z: 256'h000000000000000000000000000000000000000000000000000000000001869F,
+          r: 256'h5cbdf0646e5db4eaa398f365f2ea7a0e3d419b7e0330e39ce92bddedcac4f9bc,
+          s: 256'hf5ed201cb1d1a1679c74d7d3fc42fe4c1f3ae9c52970ca600b9c474eec66cf51,
+          expect_verif_passed: 1'b0},
+
+        // Test 5: Invalid — wrong r (r=11111 instead of real r)
+        '{name: "Invalid: wrong r",
+          z: 256'h0000000000000000000000000000000000000000000000000000000000003039,
+          r: 256'h0000000000000000000000000000000000000000000000000000000000002B67,
+          s: 256'hf5ed201cb1d1a1679c74d7d3fc42fe4c1f3ae9c52970ca600b9c474eec66cf51,
+          expect_verif_passed: 1'b0},
+
+        // Test 6: Invalid — wrong s (s=22222 instead of real s)
+        '{name: "Invalid: wrong s",
+          z: 256'h0000000000000000000000000000000000000000000000000000000000003039,
+          r: 256'h5cbdf0646e5db4eaa398f365f2ea7a0e3d419b7e0330e39ce92bddedcac4f9bc,
+          s: 256'h00000000000000000000000000000000000000000000000000000000000056CE,
+          expect_verif_passed: 1'b0},
+
+        // Test 7: Valid signature (z=0xCAFEBABE, k=0x999)
+        '{name: "Valid: z=0xCAFEBABE, k=0x999",
+          z: 256'h00000000000000000000000000000000000000000000000000000000CAFEBABE,
+          r: 256'h2e43be7a12916cf6f312a513fcb6c98b708ce2dd18dc4ebf72a807c9c8a31b0d,
+          s: 256'h8f67ef46a32e112d1b99b2d2e6adbb9a55e2b8894a1dcecec8e039f56b5eb2f8,
+          expect_verif_passed: 1'b1},
+
+        // Test 8: Invalid — random z/r/s
+        '{name: "Invalid: random z/r/s",
+          z: 256'h0000000000000000000000000000000000000000000000001111111111111111,
+          r: 256'h0000000000000000000000000000000000000000000000002222222222222222,
+          s: 256'h0000000000000000000000000000000000000000000000003333333333333333,
+          expect_verif_passed: 1'b0}
+    };
+
+    // TB signal to avoid execution before reset
+    logic reset_done = 0;
+
+    // -------------------------------------------------------------------------
+    // DUT
+    // -------------------------------------------------------------------------
+
+    logic         valid      = 1'b0;
+    logic [255:0] dut_z      = '0;
+    logic [255:0] dut_r      = '0;
+    logic [255:0] dut_s      = '0;
+
+    logic         ready;
+    logic         verif_passed;
+
+    ecdsa u_dut (
+        .clk       (clk),
+        .rst_n     (rst_n),
+        .valid     (valid),
+        .z         (dut_z),
+        .r         (dut_r),
+        .s         (dut_s),
+        .ready     (ready),
+        .verif_passed (verif_passed)
+    );
+
+    // -------------------------------------------------------------------------
+    // Test sequencer
+    // -------------------------------------------------------------------------
+
+    int next_req_ptr = 0;
+    int curr_rsp_ptr = 0;
+    int pass_count   = 0;
+    int fail_count   = 0;
+    int cycles       = 0;
+
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            reset_done <= 1'b1; // never cleared again; gates the branch below until a reset has been seen
+
+            valid  <= 1'b0;
+            dut_z  <= '0;
+            dut_r  <= '0;
+            dut_s  <= '0;
+
+        end else if (reset_done) begin
+
+            // Drive next request (or idle) when nothing is outstanding or ready is seen this cycle
+            if (!valid || ready) begin
+                if (next_req_ptr < NumTests) begin
+                    valid        <= 1'b1;
+                    cycles       <= 1; // restart the per-request cycle count
+                    dut_z        <= TESTS[next_req_ptr].z;
+                    dut_r        <= TESTS[next_req_ptr].r;
+                    dut_s        <= TESTS[next_req_ptr].s;
+                    next_req_ptr <= next_req_ptr + 1;
+                end else begin
+                    valid  <= 1'b0; // no vectors left: stop requesting
+                    cycles <= 0;
+                end
+            end
+
+            // Check response and handle timeout (=== so an X/Z verdict counts as a mismatch)
+            if (ready) begin
+                if (verif_passed === TESTS[curr_rsp_ptr].expect_verif_passed) begin
+                    $display("PASS  [%s] — %0d cycle(s), verif_passed=%0b",
+                             TESTS[curr_rsp_ptr].name, cycles, verif_passed);
+                    pass_count <= pass_count + 1;
+                end else begin
+                    $display("FAIL  [%s] — expected verif_passed=%0b, got %0b",
+                             TESTS[curr_rsp_ptr].name,
+                             TESTS[curr_rsp_ptr].expect_verif_passed, verif_passed);
+                    fail_count <= fail_count + 1;
+                end
+
+                curr_rsp_ptr <= curr_rsp_ptr + 1;
+
+            end else if (valid) begin
+                cycles <= cycles + 1; // request outstanding and not ready yet: keep counting
+                if (cycles > 10000000) begin
+                    $display("FAIL  [%s] — timeout", TESTS[curr_rsp_ptr].name);
+                    fail_count   <= fail_count + 1;
+                    curr_rsp_ptr <= curr_rsp_ptr + 1;
+                    valid        <= 1'b0; // drop the request; the next vector is issued on a later edge
+                end
+            end
+
+            // All tests complete (reads the pre-edge pointer, so this fires one edge after the last response/timeout)
+            if (curr_rsp_ptr == NumTests) begin
+                $display("");
+                if (fail_count == 0)
+                    $display("All %0d ECDSA tests passed.", pass_count);
+                else
+                    $display("ECDSA: %0d passed, %0d failed.", pass_count, fail_count);
+                $finish; // NOTE(review): also used when fail_count != 0 — confirm the harness detects failures from the log
+            end
+
+        end
+    end
+
+endmodule
diff --git a/verilog/tb/tb_math_pkg.sv b/verilog/tb/tb_math_pkg.sv
new file mode 100644
index 0000000..a1410dc
--- /dev/null
+++ b/verilog/tb/tb_math_pkg.sv
@@ -0,0 +1,149 @@
+// Modular arithmetic utilities for testbenches (simulation only).
+//
+// All functions operate on WIDTH-bit unsigned integers modulo a prime m.
+// Not synthesisable — uses full-width multiply and Fermat's little theorem.
+
+package tb_math_pkg;
+
+    import arith_pkg::*;
+
+    localparam int W2 = 2 * WIDTH;
+
+    // (a + b) mod m
+    function automatic logic [WIDTH-1:0] mod_add(input logic [WIDTH-1:0] a, b, m);
+        logic [WIDTH:0] wide_m, wide_sum; // one guard bit so the carry of a + b survives
+        wide_m   = {1'b0, m};
+        wide_sum = {1'b0, a} + {1'b0, b};
+        return WIDTH'((wide_sum >= wide_m) ? wide_sum - wide_m : wide_sum);
+    endfunction
+
+    // (a - b) mod m
+    function automatic logic [WIDTH-1:0] mod_sub(input logic [WIDTH-1:0] a, b, m);
+        // No borrow: plain difference. Borrow: wrap by taking m minus the shortfall.
+        if (a >= b) return a - b;
+        return m - (b - a);
+    endfunction
+
+    // (a * b) mod m
+    function automatic logic [WIDTH-1:0] mod_mul(input logic [WIDTH-1:0] a, b, m);
+        logic [W2-1:0] wide_prod, wide_rem; // W2 = 2*WIDTH bits, so the raw product cannot overflow
+        wide_prod = {{WIDTH{1'b0}}, a} * {{WIDTH{1'b0}}, b};
+        wide_rem  = wide_prod % {{WIDTH{1'b0}}, m};
+        return wide_rem[WIDTH-1:0];
+    endfunction
+
+    // a^(-1) mod m  — Fermat's little theorem: a^(m-2) mod m
+    function automatic logic [WIDTH-1:0] mod_inv(input logic [WIDTH-1:0] a, m);
+        // Right-to-left square-and-multiply over the bits of the exponent m - 2.
+        logic [WIDTH-1:0] exp_bits;
+        logic [WIDTH-1:0] acc;
+        logic [WIDTH-1:0] sq;
+
+        exp_bits = m - 2;
+        acc      = 1;
+        sq       = a;
+        for (int bit_idx = 0; bit_idx < WIDTH; bit_idx++) begin
+            if (exp_bits[bit_idx]) acc = mod_mul(acc, sq, m);
+            sq = mod_mul(sq, sq, m);
+        end
+        return acc;
+    endfunction
+
+    // -------------------------------------------------------------------------
+    // secp256k1 constants
+    // -------------------------------------------------------------------------
+
+    localparam logic [WIDTH-1:0] SECP256K1_P =
+        256'hFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F;
+    localparam logic [WIDTH-1:0] SECP256K1_N =
+        256'hFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141;
+    localparam logic [WIDTH-1:0] SECP256K1_GX =
+        256'h79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798;
+    localparam logic [WIDTH-1:0] SECP256K1_GY =
+        256'h483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8;
+
+    // -------------------------------------------------------------------------
+    // EC point type (affine).  valid=0 means point at infinity.
+    // -------------------------------------------------------------------------
+
+    typedef struct packed {
+        logic             valid;
+        logic [WIDTH-1:0] x;
+        logic [WIDTH-1:0] y;
+    } ec_point_t;
+
+    localparam ec_point_t EC_INF = '{valid: 1'b0, x: '0, y: '0};
+    localparam ec_point_t EC_G   = '{valid: 1'b1, x: SECP256K1_GX, y: SECP256K1_GY};
+
+    // -------------------------------------------------------------------------
+    // Affine point addition over secp256k1 (mod p)
+    // -------------------------------------------------------------------------
+
+    function automatic ec_point_t ec_add(input ec_point_t p1, input ec_point_t p2);
+        logic [WIDTH-1:0] fp, num, den, slope, xr, yr;
+        fp = SECP256K1_P;
+
+        // The point at infinity (valid=0) is the identity.
+        if (!p1.valid) return p2;
+        if (!p2.valid) return p1;
+
+        if (p1.x == p2.x) begin
+            // Equal x: identical points are doubled, anything else sums to infinity.
+            if (p1.y == p2.y) begin
+                num = mod_mul(256'd3, mod_mul(p1.x, p1.x, fp), fp);   // 3 * x1^2
+                den = mod_add(p1.y, p1.y, fp);                        // 2 * y1
+            end else return EC_INF;
+        end else begin
+            num = mod_sub(p2.y, p1.y, fp);                            // y2 - y1
+            den = mod_sub(p2.x, p1.x, fp);                            // x2 - x1
+        end
+        slope = mod_mul(num, mod_inv(den, fp), fp);                   // num / den
+
+        xr = mod_sub(mod_sub(mod_mul(slope, slope, fp), p1.x, fp), p2.x, fp);
+        yr = mod_sub(mod_mul(slope, mod_sub(p1.x, xr, fp), fp), p1.y, fp);
+        return '{valid: 1'b1, x: xr, y: yr};
+    endfunction
+
+    // -------------------------------------------------------------------------
+    // Scalar multiplication: k * P  (double-and-add)
+    // -------------------------------------------------------------------------
+
+    function automatic ec_point_t ec_mul(input logic [WIDTH-1:0] k, input ec_point_t pt);
+        // LSB-first double-and-add: `doubled` holds 2^i * pt at iteration i.
+        ec_point_t sum;
+        ec_point_t doubled;
+        sum     = EC_INF;
+        doubled = pt;
+        for (int bit_idx = 0; bit_idx < WIDTH; bit_idx++) begin
+            if (k[bit_idx]) sum = ec_add(sum, doubled);
+            doubled = ec_add(doubled, doubled);
+        end
+        return sum;
+    endfunction
+
+    // -------------------------------------------------------------------------
+    // ECDSA signature type and sign function
+    // -------------------------------------------------------------------------
+
+    typedef struct packed {
+        logic [WIDTH-1:0] r;
+        logic [WIDTH-1:0] s;
+    } ecdsa_sig_t;
+
+    // Sign message hash z with private key d and nonce k
+    function automatic ecdsa_sig_t ecdsa_sign(input logic [WIDTH-1:0] z, d, k);
+        ec_point_t        nonce_pt;
+        ecdsa_sig_t       sig;
+        logic [WIDTH-1:0] order, rd, z_plus_rd;
+
+        order    = SECP256K1_N;
+        nonce_pt = ec_mul(k, EC_G);          // R = k * G
+        sig.r    = nonce_pt.x % order;       // r = R.x mod n
+
+        rd        = mod_mul(sig.r, d, order);
+        z_plus_rd = mod_add(z, rd, order);
+        sig.s     = mod_mul(mod_inv(k, order), z_plus_rd, order);   // s = k^-1 * (z + r*d) mod n
+        return sig;
+    endfunction
+
+endpackage
diff --git a/verilog/tb/tb_top.sv b/verilog/tb/tb_top.sv
new file mode 100644
index 0000000..1646493
--- /dev/null
+++ b/verilog/tb/tb_top.sv
@@ -0,0 +1,634 @@
+// tb_top.sv - testbench for security_block
+//
+// Nonce derivation:
+//   trng_load_seed=1 fires on the first posedge after rst_n deasserts.
+//   TRNG counter = TRNG_SEED after that edge, then increments.
+//   trng_request_new pulses 100 cycles later (after INIT_DELAY_CYCLES).
+
+`include "tb_math_pkg.sv"
+
+module tb (
+    input logic clk,
+    input logic rst_n
+);
+    import arith_pkg::*;
+    import tb_math_pkg::*;
+
+    // -------------------------------------------------------------------------
+    // DUT signals
+    // -------------------------------------------------------------------------
+
+    logic             license_valid  = 1'b0;
+    logic             license_ready;
+    logic [WIDTH-1:0] license_r      = '0;
+    logic [WIDTH-1:0] license_s      = '0;
+    logic             workload_valid = 1'b0;
+    logic [7:0]       workload_a     = '0;
+    logic [7:0]       workload_b     = '0;
+    logic [WIDTH-1:0] trng_seed      = '0;
+    logic             trng_load_seed = 1'b0;
+
+    logic [WIDTH-1:0] nonce;
+    logic             nonce_ready;
+    logic [7:0]       workload_result;
+    logic             result_valid;
+    logic [63:0]      allowance;
+    logic             enabled;
+
+    security_block u_dut (
+        .clk            (clk),
+        .rst_n          (rst_n),
+        .license_valid  (license_valid),
+        .license_ready  (license_ready),
+        .license_r      (license_r),
+        .license_s      (license_s),
+        .workload_valid (workload_valid),
+        .workload_a     (workload_a),
+        .workload_b     (workload_b),
+        .trng_seed      (trng_seed),
+        .trng_load_seed (trng_load_seed),
+        .nonce          (nonce),
+        .nonce_ready    (nonce_ready),
+        .workload_result(workload_result),
+        .result_valid   (result_valid),
+        .allowance      (allowance),
+        .enabled        (enabled)
+    );
+
+    // -------------------------------------------------------------------------
+    // Constants
+    // -------------------------------------------------------------------------
+
+    localparam logic [WIDTH-1:0] TRNG_SEED = 256'd12345;
+
+    // ECDSA signing: d=2 (Q=2G), k=7
+    localparam logic [WIDTH-1:0] PRIV_KEY = 256'd2;
+    localparam logic [WIDTH-1:0] SIGN_K   = 256'd7;
+
+    localparam int VERIFY_TIMEOUT = 15_000_000;
+    localparam int NONCE_TIMEOUT  = 300;
+
+    // -------------------------------------------------------------------------
+    // TB state machine
+    // -------------------------------------------------------------------------
+
+    typedef enum logic [4:0] {
+        PH_INIT,
+        PH_T1_CHECK,
+        PH_T2_DRIVE,   PH_T2_CHECK,
+        PH_T3_CHECK,
+        PH_T4_SUBMIT,  PH_T4_CHECK,
+        PH_T5_DRIVE,   PH_T5_CHECK,
+        PH_T6_SUBMIT,  PH_T6_CHECK,
+        PH_T7_DRIVE,   PH_T7_CHECK,
+        PH_T8_DRIVE,   PH_T8_CHECK,
+        PH_T9_DRIVE,   PH_T9_CHECK,
+        PH_T10_DRIVE,  PH_T10_CHECK,
+        PH_T11_WAIT,   PH_T11_CHECK,
+        PH_T12_SUBMIT, PH_T12_CHECK,
+        PH_T13_SUBMIT, PH_T13_CHECK,
+        PH_T14_SUBMIT, PH_T14_WAIT, PH_T14_REPLAY, PH_T14_CHECK,
+        PH_DONE
+    } ph_e;
+
+    ph_e         phase;
+    logic        reset_done = 1'b0;
+    int          wait_cnt   = 0;
+    int          pass_count = 0;
+    int          fail_count = 0;
+    logic [63:0]      saved_allow;
+    logic [WIDTH-1:0] saved_nonce;
+    logic [WIDTH-1:0] saved_r;
+    logic [WIDTH-1:0] saved_s;
+
+    // -------------------------------------------------------------------------
+    // Sequencer
+    // -------------------------------------------------------------------------
+
+    // Previous cycle phase register for edge detection
+    ph_e phase_d1;
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            phase_d1 <= PH_INIT;
+        end else begin
+            phase_d1 <= phase;
+        end
+    end
+
+    // Timeout counter
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            wait_cnt <= 0;
+        end else if (phase != phase_d1) begin // phase change → reset counter
+            wait_cnt <= 0;
+        end else begin // else increment
+            wait_cnt <= wait_cnt + 1;
+        end
+    end
+
+    // Stimulus and checks - FSM
+
+    // Stimulus driving and checking logic
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            phase        <= PH_INIT;
+
+            reset_done     <= 1'b1;
+            trng_seed      <= TRNG_SEED;
+            trng_load_seed <= 1'b1;
+            license_valid  <= 1'b0;
+            license_r      <= '0;
+            license_s      <= '0;
+            workload_valid <= 1'b0;
+            workload_a     <= '0;
+            workload_b     <= '0;
+            pass_count     <= 0;
+            fail_count     <= 0;
+        end else if (reset_done) begin
+
+            case (phase)
+
+                // -------------------------------------------------------
+                PH_INIT: begin
+                    trng_load_seed <= 1'b0;   // one-cycle seed pulse done
+                    phase        <= PH_T1_CHECK;
+                end
+
+                // -------------------------------------------------------
+                // T1: Initial state
+                // -------------------------------------------------------
+                PH_T1_CHECK: begin
+                    if (allowance == '0 && enabled == 1'b0) begin
+                        $display("PASS  [T1  initial state] allowance=0 enabled=0");
+                        pass_count <= pass_count + 1;
+                    end else begin
+                        $display("FAIL  [T1  initial state] allowance=%0d enabled=%0b",
+                                 allowance, enabled);
+                        fail_count <= fail_count + 1;
+                    end
+                    phase <= phase.next();
+                end
+
+                // -------------------------------------------------------
+                // T2: Workload blocked (enabled=0)
+                // -------------------------------------------------------
+                PH_T2_DRIVE: begin
+                    workload_valid <= 1'b1;
+                    workload_a     <= 8'd10;
+                    workload_b     <= 8'd20;
+                    phase          <= PH_T2_CHECK;
+                end
+
+                PH_T2_CHECK: begin
+                    // Drop the request strobe raised in PH_T2_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // A zero result is what this test accepts as "blocked".
+                        if (workload_result == 8'd0) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T2  workload blocked] result=0");
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T2  workload blocked] result=%0d (expected 0)",
+                                    workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T3: nonce_ready asserts correctly
+                // -------------------------------------------------------
+                PH_T3_CHECK: begin
+                    // Wait for nonce_ready. Record a pass and advance when it
+                    // asserts; abort the run once wait_cnt exceeds NONCE_TIMEOUT.
+                    if (nonce_ready) begin
+                        $display("PASS  [T3  nonce_ready] nonce=0x%h", nonce);
+                        pass_count <= pass_count + 1;
+                        phase  <= phase.next();
+                    end else if (wait_cnt > NONCE_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message
+                        // must follow it or it is not treated as the message.
+                        $fatal(1, "FAIL  [T3  nonce ready] timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T4: Submit valid license
+                // -------------------------------------------------------
+                PH_T4_SUBMIT: begin
+                    // Once a nonce is available, sign it with ecdsa_sign() and
+                    // present the signature with license_valid raised.
+                    if (nonce_ready) begin
+                        ecdsa_sig_t sig;
+
+                        // Precondition for T4: no allowance has been granted yet.
+                        // $fatal takes finish_number first, then the format string.
+                        assert(allowance == 0) else $fatal(1, "Expected allowance=0 at license submission, got %0d", allowance);
+
+                        sig = ecdsa_sign(nonce, PRIV_KEY, SIGN_K);
+                        license_valid  <= 1'b1;
+                        license_r      <= sig.r;
+                        license_s      <= sig.s;
+
+                        phase        <= PH_T4_CHECK;
+                    end
+                end
+
+                PH_T4_CHECK: begin
+
+                    // Hold license_r/s until license_ready pulses
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                    end
+
+                    // Check allowance afterwards (when valid is back to deasserted).
+                    // license_valid is read here before the nonblocking clear
+                    // above takes effect, so this branch cannot run in the same
+                    // evaluation that clears it.
+                    if (!license_valid) begin
+
+                        if (allowance != '0) begin
+                            $display("PASS  [T4  valid license] allowance incremented to %0d", allowance);
+                            pass_count <= pass_count + 1;
+                        end else begin
+                            $display("FAIL  [T4  valid license] allowance not incremented");
+                            fail_count <= fail_count + 1;
+                        end
+
+                        phase       <= phase.next();
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message follows it.
+                        $fatal(1, "FAIL  [T4  valid license] handshake timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T5: Workload unblocked — 50 + 30 = 80
+                // -------------------------------------------------------
+                PH_T5_DRIVE: begin
+
+                    // Preconditions for T5: a non-zero allowance and enabled=1.
+                    // $fatal takes finish_number first, then the format string
+                    // and its arguments.
+                    assert(allowance != '0) else $fatal(1, "Expected allowance>0 before driving T5, got %0d", allowance);
+                    assert(enabled   ==  1) else $fatal(1, "Expected enabled=1 before driving T5, got %0d", enabled);
+
+                    // Present operands 50 and 30; the sum is checked in PH_T5_CHECK.
+                    workload_valid <= 1'b1;
+                    workload_a     <= 8'd50;
+                    workload_b     <= 8'd30;
+                    phase          <= PH_T5_CHECK;
+                end
+
+                PH_T5_CHECK: begin
+                    // Drop the request strobe raised in PH_T5_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // 50 + 30 must come back as 80.
+                        if (workload_result == 8'd80) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T5  workload unblocked] 50+30=%0d", workload_result);
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T5  workload unblocked] expected 80 got %0d",
+                                    workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T6: Invalid license — expect rejection, nonce unchanged
+                // Submit a hard-coded (r, s) pair (11111, 22222) that was not
+                // produced by signing the current nonce.
+                // -------------------------------------------------------
+                PH_T6_SUBMIT: begin
+                    // Wait for a nonce, then submit fixed r/s constants and
+                    // snapshot allowance and nonce for comparison in PH_T6_CHECK.
+                    if (nonce_ready) begin
+                        phase         <= PH_T6_CHECK;
+
+                        saved_nonce   <= nonce;
+                        saved_allow   <= allowance;
+
+                        license_r     <= 256'd11111;
+                        license_s     <= 256'd22222;
+                        license_valid <= 1'b1;
+                    end
+                end
+
+                PH_T6_CHECK: begin
+                    // Hold license_r/s until license_ready pulses
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                    end
+
+                    // Check after deassert: allowance must not exceed the
+                    // snapshot and the nonce must equal the snapshot.
+                    if (!license_valid) begin
+                        if (allowance <= saved_allow && nonce == saved_nonce) begin
+                            $display("PASS  [T6  invalid license] allowance not incremented, nonce unchanged");
+                            pass_count <= pass_count + 1;
+                        end else begin
+                            // Format string kept on one line: a backslash
+                            // continuation would embed the next line's leading
+                            // indentation in the printed message.
+                            $display("FAIL  [T6  invalid license] allowance incremented or nonce changed (allowance=%0d, expected <=%0d; nonce=0x%h, expected 0x%h)",
+                                     allowance, saved_allow, nonce, saved_nonce);
+                            fail_count <= fail_count + 1;
+                        end
+                        phase <= phase.next();
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message follows it.
+                        $fatal(1, "FAIL  [T6  invalid license] timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T7: Workload — 50 + 30 = 80 (positive values)
+                // -------------------------------------------------------
+                PH_T7_DRIVE: begin
+                    // Present operands 50 and 30; the sum is checked in PH_T7_CHECK.
+                    phase          <= PH_T7_CHECK;
+                    workload_a     <= 8'd50;
+                    workload_b     <= 8'd30;
+                    workload_valid <= 1'b1;
+                end
+
+                PH_T7_CHECK: begin
+                    // Drop the request strobe raised in PH_T7_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // 50 + 30 must come back as 80.
+                        if (workload_result == 8'd80) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T7  50+30] result=%0d", workload_result);
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T7  50+30] expected 80 got %0d", workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T8: -10 + -20 = -30
+                // -------------------------------------------------------
+                PH_T8_DRIVE: begin
+                    // Present two negative operands as 8-bit two's-complement
+                    // patterns; the sum is checked in PH_T8_CHECK.
+                    phase          <= PH_T8_CHECK;
+                    workload_a     <= 8'hF6;   // -10
+                    workload_b     <= 8'hEC;   // -20
+                    workload_valid <= 1'b1;
+                end
+
+                PH_T8_CHECK: begin
+                    // Drop the request strobe raised in PH_T8_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // Expected bit pattern is 0xE2, i.e. -30 in 8 bits.
+                        if (workload_result == 8'hE2) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T8  -10+-20] result=0x%h", workload_result);
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T8  -10+-20] expected 0xE2 got 0x%h", workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T9: 100 + -30 = 70
+                // -------------------------------------------------------
+                PH_T9_DRIVE: begin
+                    // Mixed-sign operands: 100 and the 8-bit pattern for -30;
+                    // the sum is checked in PH_T9_CHECK.
+                    phase          <= PH_T9_CHECK;
+                    workload_a     <= 8'd100;
+                    workload_b     <= 8'hE2;   // -30
+                    workload_valid <= 1'b1;
+                end
+
+                PH_T9_CHECK: begin
+                    // Drop the request strobe raised in PH_T9_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // 100 + (-30) must come back as 70.
+                        if (workload_result == 8'd70) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T9  100+-30] result=%0d", workload_result);
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T9  100+-30] expected 70 got %0d", workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T10: 127 + 1 = -128 (overflow wrapping)
+                // -------------------------------------------------------
+                PH_T10_DRIVE: begin
+                    // Operands 127 and 1; PH_T10_CHECK expects the 8-bit sum
+                    // to wrap to 0x80.
+                    phase          <= PH_T10_CHECK;
+                    workload_a     <= 8'd127;
+                    workload_b     <= 8'd1;
+                    workload_valid <= 1'b1;
+                end
+
+                PH_T10_CHECK: begin
+                    // Drop the request strobe raised in PH_T10_DRIVE.
+                    workload_valid <= 1'b0;
+
+                    // Remain here until result_valid is seen.
+                    if (result_valid) begin
+                        phase <= phase.next();
+
+                        // Expected bit pattern is 0x80, i.e. -128 in 8 bits.
+                        if (workload_result == 8'h80) begin
+                            pass_count <= pass_count + 1;
+                            $display("PASS  [T10 127+1 overflow] result=0x%h", workload_result);
+                        end else begin
+                            fail_count <= fail_count + 1;
+                            $display("FAIL  [T10 127+1 overflow] expected 0x80 got 0x%h", workload_result);
+                        end
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T11: Allowance decrements by 1 per cycle
+                // -------------------------------------------------------
+                PH_T11_WAIT: begin
+                    // Leave this state when wait_cnt reads 100.
+                    if (wait_cnt == 100)
+                        phase <= phase.next();
+
+                    // Snapshot the allowance when wait_cnt reads 0 so that
+                    // PH_T11_CHECK can measure how far it dropped.
+                    if (wait_cnt == 0)
+                        saved_allow <= allowance;
+                end
+
+                PH_T11_CHECK: begin
+                    // Single-shot check; always advance afterwards.
+                    phase <= phase.next();
+
+                    // Accept a drop of 95 to 105 counts from the snapshot taken
+                    // in PH_T11_WAIT.
+                    // NOTE(review): both bounds subtract from saved_allow; if
+                    // saved_allow is unsigned and can be below 105 the lower
+                    // bound would wrap — confirm the allowance is always large
+                    // enough at this point.
+                    if ( allowance <= (saved_allow - 95) &&
+                            allowance >= (saved_allow - 105) ) begin
+                        pass_count <= pass_count + 1;
+                        $display("PASS  [T11 allowance decrement] delta=%0d over ~100 cycles", saved_allow - allowance);
+                    end else begin
+                        fail_count <= fail_count + 1;
+                        $display("FAIL  [T11 allowance decrement] delta=%0d, expected ~100", saved_allow - allowance);
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T12: New nonce after valid license
+                //   Sign the current nonce dynamically, submit, check
+                //   that the nonce changes afterwards.
+                // -------------------------------------------------------
+                PH_T12_SUBMIT: begin
+                    // Wait for a nonce, sign it, submit the signature, and
+                    // remember the nonce so PH_T12_CHECK can see it change.
+                    if (nonce_ready) begin
+                        ecdsa_sig_t fresh_sig;
+                        fresh_sig = ecdsa_sign(nonce, PRIV_KEY, SIGN_K);
+
+                        phase         <= PH_T12_CHECK;
+                        saved_nonce   <= nonce;
+
+                        license_r     <= fresh_sig.r;
+                        license_s     <= fresh_sig.s;
+                        license_valid <= 1'b1;
+                    end
+                end
+
+                PH_T12_CHECK: begin
+                    // Hold license_r/s until license_ready pulses
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                    end
+
+                    // Check when next nonce is ready (after handshake completed):
+                    // the nonce must differ from the one saved in PH_T12_SUBMIT.
+                    if (!license_valid && nonce_ready) begin
+                        if (nonce != saved_nonce) begin
+                            $display("PASS  [T12 new nonce] nonce changed");
+                            pass_count <= pass_count + 1;
+                        end else begin
+                            $display("FAIL  [T12 new nonce] nonce unchanged");
+                            fail_count <= fail_count + 1;
+                        end
+                        phase <= phase.next();
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message follows it.
+                        $fatal(1, "FAIL  [T12 new nonce] timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T13: License signed for wrong nonce is rejected
+                //   Sign a wrong nonce (9999), submit against the
+                //   current nonce. Expect rejection.
+                // -------------------------------------------------------
+                PH_T13_SUBMIT: begin
+                    // Wait for a nonce, then submit a signature computed over
+                    // the constant 9999 instead of that nonce. Allowance and
+                    // nonce are snapshotted for PH_T13_CHECK.
+                    if (nonce_ready) begin
+                        ecdsa_sig_t stale_sig;
+                        stale_sig = ecdsa_sign(256'd9999, PRIV_KEY, SIGN_K);
+
+                        phase         <= PH_T13_CHECK;
+                        saved_nonce   <= nonce;
+                        saved_allow   <= allowance;
+
+                        license_r     <= stale_sig.r;
+                        license_s     <= stale_sig.s;
+                        license_valid <= 1'b1;
+                    end
+                end
+
+                PH_T13_CHECK: begin
+                    // Hold license_r/s until license_ready pulses
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                    end
+
+                    // Check after deassert: allowance must not exceed the
+                    // snapshot and the nonce must equal the snapshot.
+                    if (!license_valid) begin
+                        if (allowance <= saved_allow && nonce == saved_nonce) begin
+                            $display("PASS  [T13 wrong nonce] allowance not incremented, nonce unchanged");
+                            pass_count <= pass_count + 1;
+                        end else begin
+                            // Format string kept on one line: a backslash
+                            // continuation would embed the next line's leading
+                            // indentation in the printed message.
+                            $display("FAIL  [T13 wrong nonce] allowance incremented or nonce changed (allowance=%0d, expected <=%0d; nonce=0x%h, expected 0x%h)",
+                                     allowance, saved_allow, nonce, saved_nonce);
+                            fail_count <= fail_count + 1;
+                        end
+                        phase <= phase.next();
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message follows it.
+                        $fatal(1, "FAIL  [T13 wrong nonce] timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                // T14: Replay attack — submit same signature twice
+                //   First submission is valid (signs current nonce),
+                //   second reuses the same (r, s) against a new nonce.
+                // -------------------------------------------------------
+                PH_T14_SUBMIT: begin
+                    // Wait for a nonce, sign and submit it, and keep a copy of
+                    // (r, s) in saved_r/saved_s for the replay in PH_T14_REPLAY.
+                    if (nonce_ready) begin
+                        ecdsa_sig_t first_sig;
+                        first_sig = ecdsa_sign(nonce, PRIV_KEY, SIGN_K);
+
+                        phase         <= PH_T14_WAIT;
+
+                        saved_r       <= first_sig.r;
+                        saved_s       <= first_sig.s;
+
+                        license_r     <= first_sig.r;
+                        license_s     <= first_sig.s;
+                        license_valid <= 1'b1;
+                    end
+                end
+
+                PH_T14_WAIT: begin
+                    // Hold license_r/s until license_ready pulses, then move on
+                    // to the replay attempt.
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                        phase <= PH_T14_REPLAY;
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // Bound this handshake with the same VERIFY_TIMEOUT the
+                        // other license_ready waits in this FSM use, so a
+                        // missing license_ready aborts instead of leaving the
+                        // FSM waiting in this state.
+                        // NOTE(review): relies on wait_cnt counting cycles spent
+                        // in the current phase, as the other timeout checks
+                        // appear to — confirm against the wait_cnt logic.
+                        $fatal(1, "FAIL  [T14 replay attack] first submission timeout");
+                    end
+                end
+
+                PH_T14_REPLAY: begin
+                    // Wait for nonce_ready, then resubmit the (r, s) pair kept
+                    // in saved_r/saved_s. Allowance and nonce are snapshotted
+                    // for PH_T14_CHECK.
+                    if (nonce_ready) begin
+                        phase         <= PH_T14_CHECK;
+
+                        saved_nonce   <= nonce;
+                        saved_allow   <= allowance;
+
+                        license_r     <= saved_r;
+                        license_s     <= saved_s;
+                        license_valid <= 1'b1;
+                    end
+                end
+
+                PH_T14_CHECK: begin
+                    // Hold license_r/s until license_ready pulses
+                    if (license_ready) begin
+                        license_valid <= 1'b0;
+                        license_r     <= '0;
+                        license_s     <= '0;
+                    end
+
+                    // Check after deassert: allowance must not exceed the
+                    // snapshot and the nonce must equal the snapshot.
+                    if (!license_valid) begin
+                        if (allowance <= saved_allow && nonce == saved_nonce) begin
+                            $display("PASS  [T14 replay attack] allowance not incremented, nonce unchanged");
+                            pass_count <= pass_count + 1;
+                        end else begin
+                            // Format string kept on one line: a backslash
+                            // continuation would embed the next line's leading
+                            // indentation in the printed message.
+                            $display("FAIL  [T14 replay attack] allowance incremented or nonce changed (allowance=%0d, expected <=%0d; nonce=0x%h, expected 0x%h)",
+                                     allowance, saved_allow, nonce, saved_nonce);
+                            fail_count <= fail_count + 1;
+                        end
+                        phase <= phase.next();
+                    end else if (wait_cnt > VERIFY_TIMEOUT) begin
+                        // $fatal's first argument is finish_number; the message follows it.
+                        $fatal(1, "FAIL  [T14 replay attack] timeout");
+                    end
+                end
+
+                // -------------------------------------------------------
+                PH_DONE: begin
+                    // Print a blank separator line, then a one-line summary
+                    // chosen by whether any failure was counted, then end the
+                    // simulation with $finish.
+                    $display("");
+                    if (fail_count == 0) begin
+                        $display("All %0d security_block tests passed.", pass_count);
+                    end else begin
+                        $display("security_block: %0d passed, %0d FAILED.", pass_count, fail_count);
+                    end
+                    $finish;
+                end
+
+                default: ;
+            endcase
+        end
+    end
+
+endmodule